Diffstat (limited to 'capstone/suite/synctools/tablegen/X86/back')
44 files changed, 56080 insertions, 0 deletions
diff --git a/capstone/suite/synctools/tablegen/X86/back/X86.td b/capstone/suite/synctools/tablegen/X86/back/X86.td new file mode 100644 index 000000000..63c2dc4da --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86.td @@ -0,0 +1,1203 @@ +//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a target description file for the Intel i386 architecture, referred +// to here as the "X86" architecture. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing... +// +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// X86 Subtarget state +// + +def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true", + "64-bit mode (x86_64)">; +def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true", + "32-bit mode (80386)">; +def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true", + "16-bit mode (i8086)">; + +//===----------------------------------------------------------------------===// +// X86 Subtarget features +//===----------------------------------------------------------------------===// + +def FeatureX87 : SubtargetFeature<"x87","HasX87", "true", + "Enable X87 float instructions">; + +def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true", + "Enable NOPL instruction">; + +def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", + "Enable conditional move instructions">; + +def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", + "Support POPCNT instruction">; + +def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true", + "Support fxsave/fxrestore instructions">; + +def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true", + "Support xsave instructions">; + +def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true", + "Support xsaveopt instructions">; + +def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true", + "Support xsavec instructions">; + +def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true", + "Support xsaves instructions">; + +def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", + "Enable SSE instructions", + // SSE codegen depends on cmovs, and all + // SSE1+ processors support them. + [FeatureCMOV]>; +def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", + "Enable SSE2 instructions", + [FeatureSSE1]>; +def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3", + "Enable SSE3 instructions", + [FeatureSSE2]>; +def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3", + "Enable SSSE3 instructions", + [FeatureSSE3]>; +def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41", + "Enable SSE 4.1 instructions", + [FeatureSSSE3]>; +def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42", + "Enable SSE 4.2 instructions", + [FeatureSSE41]>; +// The MMX subtarget feature is separate from the rest of the SSE features +// because it's important (for odd compatibility reasons) to be able to +// turn it off explicitly while allowing SSE+ to be on. 
+def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX", + "Enable MMX instructions">; +def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", + "Enable 3DNow! instructions", + [FeatureMMX]>; +def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", + "Enable 3DNow! Athlon instructions", + [Feature3DNow]>; +// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied +// feature, because SSE2 can be disabled (e.g. for compiling OS kernels) +// without disabling 64-bit mode. +def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", + "Support 64-bit instructions", + [FeatureCMOV]>; +def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true", + "64-bit with cmpxchg16b", + [Feature64Bit]>; +def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", + "SHLD instruction is slow">; +def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true", + "PMULLD instruction is slow">; +// FIXME: This should not apply to CPUs that do not have SSE. +def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16", + "IsUAMem16Slow", "true", + "Slow unaligned 16-byte memory access">; +def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", + "IsUAMem32Slow", "true", + "Slow unaligned 32-byte memory access">; +def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", + "Support SSE 4a instructions", + [FeatureSSE3]>; + +def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX", + "Enable AVX instructions", + [FeatureSSE42]>; +def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2", + "Enable AVX2 instructions", + [FeatureAVX]>; +def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true", + "Enable three-operand fused multiple-add", + [FeatureAVX]>; +def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", + "Support 16-bit floating point conversion instructions", + [FeatureAVX]>; +def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F", + "Enable AVX-512 instructions", + [FeatureAVX2, FeatureFMA, FeatureF16C]>; +def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", + "Enable AVX-512 Exponential and Reciprocal Instructions", + [FeatureAVX512]>; +def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", + "Enable AVX-512 Conflict Detection Instructions", + [FeatureAVX512]>; +def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ", + "true", "Enable AVX-512 Population Count Instructions", + [FeatureAVX512]>; +def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", + "Enable AVX-512 PreFetch Instructions", + [FeatureAVX512]>; +def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1", + "true", + "Prefetch with Intent to Write and T1 Hint">; +def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true", + "Enable AVX-512 Doubleword and Quadword Instructions", + [FeatureAVX512]>; +def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true", + "Enable AVX-512 Byte and Word Instructions", + [FeatureAVX512]>; +def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true", + "Enable AVX-512 Vector Length eXtensions", + [FeatureAVX512]>; +def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true", + "Enable AVX-512 Vector Byte Manipulation Instructions", + [FeatureBWI]>; +def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true", + "Enable AVX-512 further Vector Byte Manipulation Instructions", + [FeatureBWI]>; +def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", 
"true", + "Enable AVX-512 Integer Fused Multiple-Add", + [FeatureAVX512]>; +def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", + "Enable protection keys">; +def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true", + "Enable AVX-512 Vector Neural Network Instructions", + [FeatureAVX512]>; +def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true", + "Enable AVX-512 Bit Algorithms", + [FeatureBWI]>; +def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", + "Enable packed carry-less multiplication instructions", + [FeatureSSE2]>; +def FeatureGFNI : SubtargetFeature<"gfni", "HasGFNI", "true", + "Enable Galois Field Arithmetic Instructions", + [FeatureSSE2]>; +def FeatureVPCLMULQDQ : SubtargetFeature<"vpclmulqdq", "HasVPCLMULQDQ", "true", + "Enable vpclmulqdq instructions", + [FeatureAVX, FeaturePCLMUL]>; +def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", + "Enable four-operand fused multiple-add", + [FeatureAVX, FeatureSSE4A]>; +def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", + "Enable XOP instructions", + [FeatureFMA4]>; +def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem", + "HasSSEUnalignedMem", "true", + "Allow unaligned memory operands with SSE instructions">; +def FeatureAES : SubtargetFeature<"aes", "HasAES", "true", + "Enable AES instructions", + [FeatureSSE2]>; +def FeatureVAES : SubtargetFeature<"vaes", "HasVAES", "true", + "Promote selected AES instructions to AVX512/AVX registers", + [FeatureAVX, FeatureAES]>; +def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true", + "Enable TBM instructions">; +def FeatureLWP : SubtargetFeature<"lwp", "HasLWP", "true", + "Enable LWP instructions">; +def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true", + "Support MOVBE instruction">; +def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true", + "Support RDRAND instruction">; +def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true", + "Support FS/GS Base instructions">; +def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true", + "Support LZCNT instruction">; +def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true", + "Support BMI instructions">; +def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true", + "Support BMI2 instructions">; +def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true", + "Support RTM instructions">; +def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", + "Support ADX instructions">; +def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true", + "Enable SHA instructions", + [FeatureSSE2]>; +def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true", + "Support CET Shadow-Stack instructions">; +def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", + "Support PRFCHW instructions">; +def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", + "Support RDSEED instruction">; +def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true", + "Support LAHF and SAHF instructions">; +def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true", + "Enable MONITORX/MWAITX timer functionality">; +def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true", + "Enable Cache Line Zero">; +def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true", + "Enable Cache Demote">; +def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true", + "Support ptwrite instruction">; +def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", + "Support MPX instructions">; +def FeatureLEAForSP : 
SubtargetFeature<"lea-sp", "UseLeaForSP", "true", + "Use LEA for adjusting the stack pointer">; +def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", + "HasSlowDivide32", "true", + "Use 8-bit divide for positive values less than 256">; +def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl", + "HasSlowDivide64", "true", + "Use 32-bit divide for positive values less than 2^32">; +def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", + "PadShortFunctions", "true", + "Pad short functions">; +def FeatureINVPCID : SubtargetFeature<"invpcid", "HasINVPCID", "true", + "Invalidate Process-Context Identifier">; +def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true", + "Enable Software Guard Extensions">; +def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true", + "Flush A Cache Line Optimized">; +def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true", + "Cache Line Write Back">; +def FeatureWBNOINVD : SubtargetFeature<"wbnoinvd", "HasWBNOINVD", "true", + "Write Back No Invalidate">; +def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true", + "Support RDPID instructions">; +def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true", + "Wait and pause enhancements">; +// On some processors, instructions that implicitly take two memory operands are +// slow. In practice, this means that CALL, PUSH, and POP with memory operands +// should be avoided in favor of a MOV + register CALL/PUSH/POP. +def FeatureSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops", + "SlowTwoMemOps", "true", + "Two memory operand instructions are slow">; +def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", + "LEA instruction needs inputs at AG stage">; +def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", + "LEA instruction with certain arguments is slow">; +def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true", + "LEA instruction with 3 ops or certain registers is slow">; +def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", + "INC and DEC instructions are slower than ADD and SUB">; +def FeatureSoftFloat + : SubtargetFeature<"soft-float", "UseSoftFloat", "true", + "Use software floating point features.">; +def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt", + "HasPOPCNTFalseDeps", "true", + "POPCNT has a false dependency on dest register">; +def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt", + "HasLZCNTFalseDeps", "true", + "LZCNT/TZCNT have a false dependency on dest register">; +def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true", + "platform configuration instruction">; +// On recent X86 (port bound) processors, its preferable to combine to a single shuffle +// using a variable mask over multiple fixed shuffles. +def FeatureFastVariableShuffle + : SubtargetFeature<"fast-variable-shuffle", + "HasFastVariableShuffle", + "true", "Shuffles with variable masks are fast">; +// On some X86 processors, there is no performance hazard to writing only the +// lower parts of a YMM or ZMM register without clearing the upper part. +def FeatureFastPartialYMMorZMMWrite + : SubtargetFeature<"fast-partial-ymm-or-zmm-write", + "HasFastPartialYMMorZMMWrite", + "true", "Partial writes to YMM/ZMM registers are fast">; +// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency +// than the corresponding NR code. 
FeatureFastVectorFSQRT should be enabled if +// vector FSQRT has higher throughput than the corresponding NR code. +// The idea is that throughput bound code is likely to be vectorized, so for +// vectorized code we should care about the throughput of SQRT operations. +// But if the code is scalar that probably means that the code has some kind of +// dependency and we should care more about reducing the latency. +def FeatureFastScalarFSQRT + : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT", + "true", "Scalar SQRT is fast (disable Newton-Raphson)">; +def FeatureFastVectorFSQRT + : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT", + "true", "Vector SQRT is fast (disable Newton-Raphson)">; +// If lzcnt has equivalent latency/throughput to most simple integer ops, it can +// be used to replace test/set sequences. +def FeatureFastLZCNT + : SubtargetFeature< + "fast-lzcnt", "HasFastLZCNT", "true", + "LZCNT instructions are as fast as most simple integer ops">; +// If the target can efficiently decode NOPs upto 11-bytes in length. +def FeatureFast11ByteNOP + : SubtargetFeature< + "fast-11bytenop", "HasFast11ByteNOP", "true", + "Target can quickly decode up to 11 byte NOPs">; +// If the target can efficiently decode NOPs upto 15-bytes in length. +def FeatureFast15ByteNOP + : SubtargetFeature< + "fast-15bytenop", "HasFast15ByteNOP", "true", + "Target can quickly decode up to 15 byte NOPs">; +// Sandy Bridge and newer processors can use SHLD with the same source on both +// inputs to implement rotate to avoid the partial flag update of the normal +// rotate instructions. +def FeatureFastSHLDRotate + : SubtargetFeature< + "fast-shld-rotate", "HasFastSHLDRotate", "true", + "SHLD can be used as a faster rotate">; + +// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka +// "string operations"). See "REP String Enhancement" in the Intel Software +// Development Manual. This feature essentially means that REP MOVSB will copy +// using the largest available size instead of copying bytes one by one, making +// it at least as fast as REPMOVS{W,D,Q}. +def FeatureERMSB + : SubtargetFeature< + "ermsb", "HasERMSB", "true", + "REP MOVS/STOS are fast">; + +// Sandy Bridge and newer processors have many instructions that can be +// fused with conditional branches and pass through the CPU as a single +// operation. +def FeatureMacroFusion + : SubtargetFeature<"macrofusion", "HasMacroFusion", "true", + "Various instructions can be fused with conditional branches">; + +// Gather is available since Haswell (AVX2 set). So technically, we can +// generate Gathers on all AVX2 processors. But the overhead on HSW is high. +// Skylake Client processor has faster Gathers than HSW and performance is +// similar to Skylake Server (AVX-512). +def FeatureHasFastGather + : SubtargetFeature<"fast-gather", "HasFastGather", "true", + "Indicates if gather is reasonably fast.">; + +def FeaturePrefer256Bit + : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true", + "Prefer 256-bit AVX instructions">; + +// Enable mitigation of some aspects of speculative execution related +// vulnerabilities by removing speculatable indirect branches. This disables +// jump-table formation, rewrites explicit `indirectbr` instructions into +// `switch` instructions, and uses a special construct called a "retpoline" to +// prevent speculation of the remaining indirect branches (indirect calls and +// tail calls). 
+def FeatureRetpoline + : SubtargetFeature<"retpoline", "UseRetpoline", "true", + "Remove speculation of indirect branches from the " + "generated code, either by avoiding them entirely or " + "lowering them with a speculation blocking construct.">; + +// Rely on external thunks for the emitted retpoline calls. This allows users +// to provide their own custom thunk definitions in highly specialized +// environments such as a kernel that does boot-time hot patching. +def FeatureRetpolineExternalThunk + : SubtargetFeature< + "retpoline-external-thunk", "UseRetpolineExternalThunk", "true", + "Enable retpoline, but with an externally provided thunk.", + [FeatureRetpoline]>; + +// Direct Move instructions. +def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true", + "Support movdiri instruction">; +def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true", + "Support movdir64b instruction">; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "X86RegisterInfo.td" +include "X86RegisterBanks.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "X86Schedule.td" +include "X86InstrInfo.td" +include "X86SchedPredicates.td" + +def X86InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// X86 processors supported. +//===----------------------------------------------------------------------===// + +include "X86ScheduleAtom.td" +include "X86SchedSandyBridge.td" +include "X86SchedHaswell.td" +include "X86SchedBroadwell.td" +include "X86ScheduleSLM.td" +include "X86ScheduleZnver1.td" +include "X86ScheduleBtVer2.td" +include "X86SchedSkylakeClient.td" +include "X86SchedSkylakeServer.td" + +def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", + "Intel Atom processors">; +def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", + "Intel Silvermont processors">; +def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM", + "Intel Goldmont processors">; +def ProcIntelGLP : SubtargetFeature<"glp", "X86ProcFamily", "IntelGLP", + "Intel Goldmont Plus processors">; +def ProcIntelTRM : SubtargetFeature<"tremont", "X86ProcFamily", "IntelTRM", + "Intel Tremont processors">; +def ProcIntelHSW : SubtargetFeature<"haswell", "X86ProcFamily", + "IntelHaswell", "Intel Haswell processors">; +def ProcIntelBDW : SubtargetFeature<"broadwell", "X86ProcFamily", + "IntelBroadwell", "Intel Broadwell processors">; +def ProcIntelSKL : SubtargetFeature<"skylake", "X86ProcFamily", + "IntelSkylake", "Intel Skylake processors">; +def ProcIntelKNL : SubtargetFeature<"knl", "X86ProcFamily", + "IntelKNL", "Intel Knights Landing processors">; +def ProcIntelSKX : SubtargetFeature<"skx", "X86ProcFamily", + "IntelSKX", "Intel Skylake Server processors">; +def ProcIntelCNL : SubtargetFeature<"cannonlake", "X86ProcFamily", + "IntelCannonlake", "Intel Cannonlake processors">; +def ProcIntelICL : SubtargetFeature<"icelake-client", "X86ProcFamily", + "IntelIcelakeClient", "Intel Icelake processors">; +def ProcIntelICX : SubtargetFeature<"icelake-server", "X86ProcFamily", + "IntelIcelakeServer", "Intel Icelake Server processors">; + +class Proc<string Name, list<SubtargetFeature> Features> + : 
ProcessorModel<Name, GenericModel, Features>; + +def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16]>; +def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>; +def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>; +def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>; +def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>; +def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; + +def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>; +def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV, + FeatureNOPL]>; + +def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureCMOV, FeatureFXSR, FeatureNOPL]>; + +foreach P = ["pentium3", "pentium3m"] in { + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR, FeatureNOPL]>; +} + +// Enable the PostRAScheduler for SSE2 and SSE3 class cpus. +// The intent is to enable it for pentium4 which is the current default +// processor in a vanilla 32-bit clang compilation when no specific +// architecture is specified. This generally gives a nice performance +// increase on silvermont, with largely neutral behavior on other +// contemporary large core processors. +// pentium-m, pentium4m, prescott and nocona are included as a preventative +// measure to avoid performance surprises, in case clang's default cpu +// changes slightly. + +def : ProcessorModel<"pentium-m", GenericPostRAModel, + [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureSSE2, FeatureFXSR, FeatureNOPL]>; + +foreach P = ["pentium4", "pentium4m"] in { + def : ProcessorModel<P, GenericPostRAModel, + [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureSSE2, FeatureFXSR, FeatureNOPL]>; +} + +// Intel Quark. +def : Proc<"lakemont", []>; + +// Intel Core Duo. +def : ProcessorModel<"yonah", SandyBridgeModel, + [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, + FeatureFXSR, FeatureNOPL]>; + +// NetBurst. +def : ProcessorModel<"prescott", GenericPostRAModel, + [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, + FeatureFXSR, FeatureNOPL]>; +def : ProcessorModel<"nocona", GenericPostRAModel, [ + FeatureX87, + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE3, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B +]>; + +// Intel Core 2 Solo/Duo. +def : ProcessorModel<"core2", SandyBridgeModel, [ + FeatureX87, + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeatureLAHFSAHF, + FeatureMacroFusion +]>; +def : ProcessorModel<"penryn", SandyBridgeModel, [ + FeatureX87, + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE41, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeatureLAHFSAHF, + FeatureMacroFusion +]>; + +// Atom CPUs. +class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [ + ProcIntelAtom, + FeatureX87, + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureLEAForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, + FeatureSlowTwoMemOps, + FeatureLEAUsesAG, + FeaturePadShortFunctions, + FeatureLAHFSAHF +]>; +def : BonnellProc<"bonnell">; +def : BonnellProc<"atom">; // Pin the generic name to the baseline. 
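The Proc class and the def : Proc<...> entries above are the pattern every CPU entry in this file follows. As a minimal sketch (the CPU name and feature mix here are hypothetical, not part of the upstream file), a new entry simply reuses the SubtargetFeature defs, and implied features are pulled in transitively:

// Hypothetical example only: FeatureSSE42 transitively implies SSE4.1, SSSE3,
// SSE3, SSE2, SSE1 and CMOV through the implication lists defined above, so
// those do not need to be listed explicitly.
def : Proc<"example-cpu", [FeatureX87, FeatureMMX, FeatureSSE42,
                           FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B]>;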
+ +class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ + ProcIntelSLM, + FeatureX87, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeaturePOPCNT, + FeaturePCLMUL, + FeatureAES, + FeatureSlowDivide64, + FeatureSlowTwoMemOps, + FeaturePRFCHW, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowPMULLD, + FeatureRDRAND, + FeatureLAHFSAHF, + FeaturePOPCNTFalseDeps +]>; +def : SilvermontProc<"silvermont">; +def : SilvermontProc<"slm">; // Legacy alias. + +class ProcessorFeatures<list<SubtargetFeature> Inherited, + list<SubtargetFeature> NewFeatures> { + list<SubtargetFeature> Value = !listconcat(Inherited, NewFeatures); +} + +class ProcModel<string Name, SchedMachineModel Model, + list<SubtargetFeature> ProcFeatures, + list<SubtargetFeature> OtherFeatures> : + ProcessorModel<Name, Model, !listconcat(ProcFeatures, OtherFeatures)>; + +def GLMFeatures : ProcessorFeatures<[], [ + FeatureX87, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeaturePOPCNT, + FeaturePCLMUL, + FeatureAES, + FeaturePRFCHW, + FeatureSlowTwoMemOps, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureLAHFSAHF, + FeatureMPX, + FeatureSHA, + FeatureRDRAND, + FeatureRDSEED, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureXSAVEC, + FeatureXSAVES, + FeatureCLFLUSHOPT, + FeatureFSGSBase +]>; + +class GoldmontProc<string Name> : ProcModel<Name, SLMModel, + GLMFeatures.Value, [ + ProcIntelGLM, + FeaturePOPCNTFalseDeps +]>; +def : GoldmontProc<"goldmont">; + +def GLPFeatures : ProcessorFeatures<GLMFeatures.Value, [ + FeaturePTWRITE, + FeatureRDPID, + FeatureSGX +]>; + +class GoldmontPlusProc<string Name> : ProcModel<Name, SLMModel, + GLPFeatures.Value, [ + ProcIntelGLP +]>; +def : GoldmontPlusProc<"goldmont-plus">; + +class TremontProc<string Name> : ProcModel<Name, SLMModel, + GLPFeatures.Value, [ + ProcIntelTRM, + FeatureCLDEMOTE, + FeatureGFNI, + FeatureMOVDIRI, + FeatureMOVDIR64B, + FeatureWAITPKG +]>; +def : TremontProc<"tremont">; + +// "Arrandale" along with corei3 and corei5 +class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureX87, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureLAHFSAHF, + FeatureMacroFusion +]>; +def : NehalemProc<"nehalem">; +def : NehalemProc<"corei7">; + +// Westmere is a similar machine to nehalem with some additional features. +// Westmere is the corei3/i5/i7 path from nehalem to sandybridge +class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ + FeatureX87, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureLAHFSAHF, + FeatureMacroFusion +]>; +def : WestmereProc<"westmere">; + +// SSE is not listed here since llvm treats AVX as a reimplementation of SSE, +// rather than a superset. +def SNBFeatures : ProcessorFeatures<[], [ + FeatureX87, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureAES, + FeatureSlowDivide64, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureLAHFSAHF, + FeatureSlow3OpsLEA, + FeatureFastScalarFSQRT, + FeatureFastSHLDRotate, + FeatureSlowIncDec, + FeatureMacroFusion +]>; + +class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel, + SNBFeatures.Value, [ + FeatureSlowUAMem32, + FeaturePOPCNTFalseDeps +]>; +def : SandyBridgeProc<"sandybridge">; +def : SandyBridgeProc<"corei7-avx">; // Legacy alias. 
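The ProcessorFeatures/ProcModel helpers defined above are how later CPU lists inherit from earlier ones via !listconcat. A minimal sketch of that idiom with hypothetical "example-nextgen" names (the real Ivy Bridge definitions that follow use exactly the same shape):

// Hypothetical example only: inherit the Sandy Bridge feature list and add one
// ISA bit; tuning-only flags stay in the per-CPU OtherFeatures list so they
// are not inherited by anything built on top of this list.
def ExampleNextGenFeatures : ProcessorFeatures<SNBFeatures.Value, [
  FeatureRDRAND
]>;
class ExampleNextGenProc<string Name> : ProcModel<Name, SandyBridgeModel,
                                        ExampleNextGenFeatures.Value, [
  FeaturePOPCNTFalseDeps
]>;
def : ExampleNextGenProc<"example-nextgen">;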
+ +def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [ + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase +]>; + +class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel, + IVBFeatures.Value, [ + FeatureSlowUAMem32, + FeaturePOPCNTFalseDeps +]>; +def : IvyBridgeProc<"ivybridge">; +def : IvyBridgeProc<"core-avx-i">; // Legacy alias. + +def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [ + FeatureAVX2, + FeatureBMI, + FeatureBMI2, + FeatureERMSB, + FeatureFMA, + FeatureINVPCID, + FeatureLZCNT, + FeatureMOVBE, + FeatureFastVariableShuffle +]>; + +class HaswellProc<string Name> : ProcModel<Name, HaswellModel, + HSWFeatures.Value, [ + ProcIntelHSW, + FeaturePOPCNTFalseDeps, + FeatureLZCNTFalseDeps +]>; +def : HaswellProc<"haswell">; +def : HaswellProc<"core-avx2">; // Legacy alias. + +def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [ + FeatureADX, + FeatureRDSEED, + FeaturePRFCHW +]>; +class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel, + BDWFeatures.Value, [ + ProcIntelBDW, + FeaturePOPCNTFalseDeps, + FeatureLZCNTFalseDeps +]>; +def : BroadwellProc<"broadwell">; + +def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [ + FeatureMPX, + FeatureRTM, + FeatureXSAVEC, + FeatureXSAVES, + FeatureCLFLUSHOPT, + FeatureFastVectorFSQRT +]>; + +class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel, + SKLFeatures.Value, [ + ProcIntelSKL, + FeatureHasFastGather, + FeaturePOPCNTFalseDeps, + FeatureSGX +]>; +def : SkylakeClientProc<"skylake">; + +def KNLFeatures : ProcessorFeatures<IVBFeatures.Value, [ + FeatureAVX512, + FeatureERI, + FeatureCDI, + FeaturePFI, + FeaturePREFETCHWT1, + FeatureADX, + FeatureRDSEED, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeaturePRFCHW +]>; + +// FIXME: define KNL model +class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel, + KNLFeatures.Value, [ + ProcIntelKNL, + FeatureSlowTwoMemOps, + FeatureFastPartialYMMorZMMWrite, + FeatureHasFastGather +]>; +def : KnightsLandingProc<"knl">; + +class KnightsMillProc<string Name> : ProcModel<Name, HaswellModel, + KNLFeatures.Value, [ + ProcIntelKNL, + FeatureSlowTwoMemOps, + FeatureFastPartialYMMorZMMWrite, + FeatureHasFastGather, + FeatureVPOPCNTDQ +]>; +def : KnightsMillProc<"knm">; // TODO Add AVX5124FMAPS/AVX5124VNNIW features + +def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [ + FeatureAVX512, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, + FeatureCLWB +]>; + +class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel, + SKXFeatures.Value, [ + ProcIntelSKX, + FeatureHasFastGather, + FeaturePOPCNTFalseDeps +]>; +def : SkylakeServerProc<"skylake-avx512">; +def : SkylakeServerProc<"skx">; // Legacy alias. 
+ +def CNLFeatures : ProcessorFeatures<SKLFeatures.Value, [ + FeatureAVX512, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, + FeatureVBMI, + FeatureIFMA, + FeatureSHA, + FeatureSGX +]>; + +class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel, + CNLFeatures.Value, [ + ProcIntelCNL, + FeatureHasFastGather +]>; +def : CannonlakeProc<"cannonlake">; + +def ICLFeatures : ProcessorFeatures<CNLFeatures.Value, [ + FeatureBITALG, + FeatureVAES, + FeatureVBMI2, + FeatureVNNI, + FeatureVPCLMULQDQ, + FeatureVPOPCNTDQ, + FeatureGFNI, + FeatureCLWB, + FeatureRDPID +]>; + +class IcelakeClientProc<string Name> : ProcModel<Name, SkylakeServerModel, + ICLFeatures.Value, [ + ProcIntelICL, + FeatureHasFastGather +]>; +def : IcelakeClientProc<"icelake-client">; + +class IcelakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel, + ICLFeatures.Value, [ + ProcIntelICX, + FeaturePCONFIG, + FeatureWBNOINVD, + FeatureHasFastGather +]>; +def : IcelakeServerProc<"icelake-server">; + +// AMD CPUs. + +def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; + +foreach P = ["athlon", "athlon-tbird"] in { + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, Feature3DNowA, + FeatureNOPL, FeatureSlowSHLD]>; +} + +foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in { + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE1, + Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureSlowSHLD]>; +} + +foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in { + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureSlowSHLD]>; +} + +foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureSlowSHLD]>; +} + +foreach P = ["amdfam10", "barcelona"] in { + def : Proc<P, [FeatureX87, FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureNOPL, FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowSHLD, FeatureLAHFSAHF]>; +} + +// Bobcat +def : Proc<"btver1", [ + FeatureX87, + FeatureMMX, + FeatureSSSE3, + FeatureSSE4A, + FeatureFXSR, + FeatureNOPL, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureSlowSHLD, + FeatureLAHFSAHF, + FeatureFast15ByteNOP +]>; + +// Jaguar +def : ProcessorModel<"btver2", BtVer2Model, [ + FeatureX87, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureNOPL, + FeatureSSE4A, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureAES, + FeaturePCLMUL, + FeatureBMI, + FeatureF16C, + FeatureMOVBE, + FeatureLZCNT, + FeatureFastLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureLAHFSAHF, + FeatureFast15ByteNOP, + FeatureFastPartialYMMorZMMWrite +]>; + +// Bulldozer +def : Proc<"bdver1", [ + FeatureX87, + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureNOPL, + FeatureSSE4A, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureLWP, + FeatureSlowSHLD, + FeatureLAHFSAHF, + FeatureFast11ByteNOP, + FeatureMacroFusion +]>; +// Piledriver +def : Proc<"bdver2", [ + FeatureX87, + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureNOPL, + FeatureSSE4A, + FeatureF16C, + 
FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureLWP, + FeatureFMA, + FeatureSlowSHLD, + FeatureLAHFSAHF, + FeatureFast11ByteNOP, + FeatureMacroFusion +]>; + +// Steamroller +def : Proc<"bdver3", [ + FeatureX87, + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureNOPL, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureLWP, + FeatureFMA, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureFSGSBase, + FeatureLAHFSAHF, + FeatureFast11ByteNOP, + FeatureMacroFusion +]>; + +// Excavator +def : Proc<"bdver4", [ + FeatureX87, + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureNOPL, + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureBMI2, + FeatureTBM, + FeatureLWP, + FeatureFMA, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureFSGSBase, + FeatureLAHFSAHF, + FeatureFast11ByteNOP, + FeatureMWAITX, + FeatureMacroFusion +]>; + +// Znver1 +def: ProcessorModel<"znver1", Znver1Model, [ + FeatureADX, + FeatureAES, + FeatureAVX2, + FeatureBMI, + FeatureBMI2, + FeatureCLFLUSHOPT, + FeatureCLZERO, + FeatureCMPXCHG16B, + FeatureF16C, + FeatureFMA, + FeatureFSGSBase, + FeatureFXSR, + FeatureNOPL, + FeatureFastLZCNT, + FeatureLAHFSAHF, + FeatureLZCNT, + FeatureFast15ByteNOP, + FeatureMacroFusion, + FeatureMMX, + FeatureMOVBE, + FeatureMWAITX, + FeaturePCLMUL, + FeaturePOPCNT, + FeaturePRFCHW, + FeatureRDRAND, + FeatureRDSEED, + FeatureSHA, + FeatureSSE4A, + FeatureSlowSHLD, + FeatureX87, + FeatureXSAVE, + FeatureXSAVEC, + FeatureXSAVEOPT, + FeatureXSAVES]>; + +def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA]>; + +def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureSSE1, FeatureFXSR]>; + +// We also provide a generic 64-bit specific x86 processor model which tries to +// be good for modern chips without enabling instruction set encodings past the +// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and +// modern 64-bit x86 chip, and enables features that are generally beneficial. +// +// We currently use the Sandy Bridge model as the default scheduling model as +// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which +// covers a huge swath of x86 processors. If there are specific scheduling +// knobs which need to be tuned differently for AMD chips, we might consider +// forming a common base for them. +def : ProcessorModel<"x86-64", SandyBridgeModel, [ + FeatureX87, + FeatureMMX, + FeatureSSE2, + FeatureFXSR, + FeatureNOPL, + Feature64Bit, + FeatureSlow3OpsLEA, + FeatureSlowIncDec, + FeatureMacroFusion +]>; + +//===----------------------------------------------------------------------===// +// Calling Conventions +//===----------------------------------------------------------------------===// + +include "X86CallingConv.td" + + +//===----------------------------------------------------------------------===// +// Assembly Parser +//===----------------------------------------------------------------------===// + +def ATTAsmParserVariant : AsmParserVariant { + int Variant = 0; + + // Variant name. 
+ string Name = "att"; + + // Discard comments in assembly strings. + string CommentDelimiter = "#"; + + // Recognize hard coded registers. + string RegisterPrefix = "%"; +} + +def IntelAsmParserVariant : AsmParserVariant { + int Variant = 1; + + // Variant name. + string Name = "intel"; + + // Discard comments in assembly strings. + string CommentDelimiter = ";"; + + // Recognize hard coded registers. + string RegisterPrefix = ""; +} + +//===----------------------------------------------------------------------===// +// Assembly Printers +//===----------------------------------------------------------------------===// + +// The X86 target supports two different syntaxes for emitting machine code. +// This is controlled by the -x86-asm-syntax={att|intel} +def ATTAsmWriter : AsmWriter { + string AsmWriterClassName = "ATTInstPrinter"; + int Variant = 0; +} +def IntelAsmWriter : AsmWriter { + string AsmWriterClassName = "IntelInstPrinter"; + int Variant = 1; +} + +def X86 : Target { + // Information about the instructions... + let InstructionSet = X86InstrInfo; + let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant]; + let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter]; + let AllowRegisterRenaming = 1; +} + +//===----------------------------------------------------------------------===// +// Pfm Counters +//===----------------------------------------------------------------------===// + +include "X86PfmCounters.td" diff --git a/capstone/suite/synctools/tablegen/X86/back/X86CallingConv.td b/capstone/suite/synctools/tablegen/X86/back/X86CallingConv.td new file mode 100644 index 000000000..fcc9a296d --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86CallingConv.td @@ -0,0 +1,1150 @@ +//===-- X86CallingConv.td - Calling Conventions X86 32/64 --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for the X86-32 and X86-64 +// architectures. +// +//===----------------------------------------------------------------------===// + +/// CCIfSubtarget - Match if the current subtarget has a feature F. +class CCIfSubtarget<string F, CCAction A> + : CCIf<!strconcat("static_cast<const X86Subtarget&>" + "(State.getMachineFunction().getSubtarget()).", F), + A>; + +/// CCIfNotSubtarget - Match if the current subtarget doesn't has a feature F. 
+class CCIfNotSubtarget<string F, CCAction A> + : CCIf<!strconcat("!static_cast<const X86Subtarget&>" + "(State.getMachineFunction().getSubtarget()).", F), + A>; + +// Register classes for RegCall +class RC_X86_RegCall { + list<Register> GPR_8 = []; + list<Register> GPR_16 = []; + list<Register> GPR_32 = []; + list<Register> GPR_64 = []; + list<Register> FP_CALL = [FP0]; + list<Register> FP_RET = [FP0, FP1]; + list<Register> XMM = []; + list<Register> YMM = []; + list<Register> ZMM = []; +} + +// RegCall register classes for 32 bits +def RC_X86_32_RegCall : RC_X86_RegCall { + let GPR_8 = [AL, CL, DL, DIL, SIL]; + let GPR_16 = [AX, CX, DX, DI, SI]; + let GPR_32 = [EAX, ECX, EDX, EDI, ESI]; + let GPR_64 = [RAX]; ///< Not actually used, but AssignToReg can't handle [] + ///< \todo Fix AssignToReg to enable empty lists + let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]; + let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7]; + let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]; +} + +class RC_X86_64_RegCall : RC_X86_RegCall { + let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]; + let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, + YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15]; + let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7, + ZMM8, ZMM9, ZMM10, ZMM11, ZMM12, ZMM13, ZMM14, ZMM15]; +} + +def RC_X86_64_RegCall_Win : RC_X86_64_RegCall { + let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R10B, R11B, R12B, R14B, R15B]; + let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R10W, R11W, R12W, R14W, R15W]; + let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R10D, R11D, R12D, R14D, R15D]; + let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11, R12, R14, R15]; +} + +def RC_X86_64_RegCall_SysV : RC_X86_64_RegCall { + let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R12B, R13B, R14B, R15B]; + let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R12W, R13W, R14W, R15W]; + let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R12D, R13D, R14D, R15D]; + let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R12, R13, R14, R15]; +} + +// X86-64 Intel regcall calling convention. +multiclass X86_RegCall_base<RC_X86_RegCall RC> { +def CC_#NAME : CallingConv<[ + // Handles byval parameters. + CCIfSubtarget<"is64Bit()", CCIfByVal<CCPassByVal<8, 8>>>, + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, + + // Promote v8i1/v16i1/v32i1 arguments to i32. 
+ CCIfType<[v8i1, v16i1, v32i1], CCPromoteToType<i32>>, + + // bool, char, int, enum, long, pointer --> GPR + CCIfType<[i32], CCAssignToReg<RC.GPR_32>>, + + // long long, __int64 --> GPR + CCIfType<[i64], CCAssignToReg<RC.GPR_64>>, + + // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32) + CCIfType<[v64i1], CCPromoteToType<i64>>, + CCIfSubtarget<"is64Bit()", CCIfType<[i64], + CCAssignToReg<RC.GPR_64>>>, + CCIfSubtarget<"is32Bit()", CCIfType<[i64], + CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>, + + // float, double, float128 --> XMM + // In the case of SSE disabled --> save to stack + CCIfType<[f32, f64, f128], + CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>, + + // long double --> FP + CCIfType<[f80], CCAssignToReg<RC.FP_CALL>>, + + // __m128, __m128i, __m128d --> XMM + // In the case of SSE disabled --> save to stack + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>, + + // __m256, __m256i, __m256d --> YMM + // In the case of SSE disabled --> save to stack + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", CCAssignToReg<RC.YMM>>>, + + // __m512, __m512i, __m512d --> ZMM + // In the case of SSE disabled --> save to stack + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfSubtarget<"hasAVX512()",CCAssignToReg<RC.ZMM>>>, + + // If no register was found -> assign to stack + + // In 64 bit, assign 64/32 bit values to 8 byte stack + CCIfSubtarget<"is64Bit()", CCIfType<[i32, i64, f32, f64], + CCAssignToStack<8, 8>>>, + + // In 32 bit, assign 64/32 bit values to 8/4 byte stack + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64], CCAssignToStack<8, 4>>, + + // MMX type gets 8 byte slot in stack , while alignment depends on target + CCIfSubtarget<"is64Bit()", CCIfType<[x86mmx], CCAssignToStack<8, 8>>>, + CCIfType<[x86mmx], CCAssignToStack<8, 4>>, + + // float 128 get stack slots whose size and alignment depends + // on the subtarget. + CCIfType<[f80, f128], CCAssignToStack<0, 0>>, + + // Vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToStack<16, 16>>, + + // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + + // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. + CCIfType<[v16i32, v8i64, v16f32, v8f64], CCAssignToStack<64, 64>> +]>; + +def RetCC_#NAME : CallingConv<[ + // Promote i1, v1i1, v8i1 arguments to i8. + CCIfType<[i1, v1i1, v8i1], CCPromoteToType<i8>>, + + // Promote v16i1 arguments to i16. + CCIfType<[v16i1], CCPromoteToType<i16>>, + + // Promote v32i1 arguments to i32. 
+ CCIfType<[v32i1], CCPromoteToType<i32>>, + + // bool, char, int, enum, long, pointer --> GPR + CCIfType<[i8], CCAssignToReg<RC.GPR_8>>, + CCIfType<[i16], CCAssignToReg<RC.GPR_16>>, + CCIfType<[i32], CCAssignToReg<RC.GPR_32>>, + + // long long, __int64 --> GPR + CCIfType<[i64], CCAssignToReg<RC.GPR_64>>, + + // __mmask64 (v64i1) --> GPR64 (for x64) or 2 x GPR32 (for IA32) + CCIfType<[v64i1], CCPromoteToType<i64>>, + CCIfSubtarget<"is64Bit()", CCIfType<[i64], + CCAssignToReg<RC.GPR_64>>>, + CCIfSubtarget<"is32Bit()", CCIfType<[i64], + CCCustom<"CC_X86_32_RegCall_Assign2Regs">>>, + + // long double --> FP + CCIfType<[f80], CCAssignToReg<RC.FP_RET>>, + + // float, double, float128 --> XMM + CCIfType<[f32, f64, f128], + CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>, + + // __m128, __m128i, __m128d --> XMM + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasSSE1()", CCAssignToReg<RC.XMM>>>, + + // __m256, __m256i, __m256d --> YMM + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", CCAssignToReg<RC.YMM>>>, + + // __m512, __m512i, __m512d --> ZMM + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfSubtarget<"hasAVX512()", CCAssignToReg<RC.ZMM>>> +]>; +} + +//===----------------------------------------------------------------------===// +// Return Value Calling Conventions +//===----------------------------------------------------------------------===// + +// Return-value conventions common to all X86 CC's. +def RetCC_X86Common : CallingConv<[ + // Scalar values are returned in AX first, then DX. For i8, the ABI + // requires the values to be in AL and AH, however this code uses AL and DL + // instead. This is because using AH for the second register conflicts with + // the way LLVM does multiple return values -- a return of {i16,i8} would end + // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI + // for functions that return two i8 values are currently expected to pack the + // values into an i16 (which uses AX, and thus AL:AH). + // + // For code that doesn't care about the ABI, we allow returning more than two + // integer values in registers. + CCIfType<[v1i1], CCPromoteToType<i8>>, + CCIfType<[i1], CCPromoteToType<i8>>, + CCIfType<[i8] , CCAssignToReg<[AL, DL, CL]>>, + CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, + CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>, + CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX]>>, + + // Boolean vectors of AVX-512 are returned in SIMD registers. + // The call from AVX to AVX-512 function should work, + // since the boolean types in AVX/AVX2 are promoted by default. + CCIfType<[v2i1], CCPromoteToType<v2i64>>, + CCIfType<[v4i1], CCPromoteToType<v4i32>>, + CCIfType<[v8i1], CCPromoteToType<v8i16>>, + CCIfType<[v16i1], CCPromoteToType<v16i8>>, + CCIfType<[v32i1], CCPromoteToType<v32i8>>, + CCIfType<[v64i1], CCPromoteToType<v64i8>>, + + // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 + // can only be used by ABI non-compliant code. If the target doesn't have XMM + // registers, it won't have vector types. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3 + // can only be used by ABI non-compliant code. This vector type is only + // supported while using the AVX target feature. 
+ CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 + // can only be used by ABI non-compliant code. This vector type is only + // supported while using the AVX-512 target feature. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + + // MMX vector types are always returned in MM0. If the target doesn't have + // MM0, it doesn't support these vector types. + CCIfType<[x86mmx], CCAssignToReg<[MM0]>>, + + // Long double types are always returned in FP0 (even with SSE), + // except on Win64. + CCIfNotSubtarget<"isTargetWin64()", CCIfType<[f80], CCAssignToReg<[FP0, FP1]>>> +]>; + +// X86-32 C return-value convention. +def RetCC_X86_32_C : CallingConv<[ + // The X86-32 calling convention returns FP values in FP0, unless marked + // with "inreg" (used here to distinguish one kind of reg from another, + // weirdly; this is really the sse-regparm calling convention) in which + // case they use XMM0, otherwise it is the same as the common X86 calling + // conv. + CCIfInReg<CCIfSubtarget<"hasSSE2()", + CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, + CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>, + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-32 FastCC return-value convention. +def RetCC_X86_32_Fast : CallingConv<[ + // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has + // SSE2. + // This can happen when a float, 2 x float, or 3 x float vector is split by + // target lowering, and is returned in 1-3 sse regs. + CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, + CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>, + + // For integers, ECX can be used as an extra return register + CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>, + CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>, + CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>, + + // Otherwise, it is the same as the common X86 calling convention. + CCDelegateTo<RetCC_X86Common> +]>; + +// Intel_OCL_BI return-value convention. +def RetCC_Intel_OCL_BI : CallingConv<[ + // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. + CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit FP vectors + // No more than 4 registers + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // 512-bit FP vectors + CCIfType<[v16f32, v8f64, v16i32, v8i64], + CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, + + // i32, i64 in the standard way + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-32 HiPE return-value convention. +def RetCC_X86_32_HiPE : CallingConv<[ + // Promote all types to i32 + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Return: HP, P, VAL1, VAL2 + CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>> +]>; + +// X86-32 Vectorcall return-value convention. +def RetCC_X86_32_VectorCall : CallingConv<[ + // Floating Point types are returned in XMM0,XMM1,XMMM2 and XMM3. + CCIfType<[f32, f64, f128], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // Return integers in the standard way. + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-64 C return-value convention. +def RetCC_X86_64_C : CallingConv<[ + // The X86-64 calling convention always returns FP values in XMM0. 
+ CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, + + // MMX vector types are always returned in XMM0. + CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, + + CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>, + + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-Win64 C return-value convention. +def RetCC_X86_Win64_C : CallingConv<[ + // The X86-Win64 calling convention always returns __m64 values in RAX. + CCIfType<[x86mmx], CCBitConvertToType<i64>>, + + // Otherwise, everything is the same as 'normal' X86-64 C CC. + CCDelegateTo<RetCC_X86_64_C> +]>; + +// X86-64 vectorcall return-value convention. +def RetCC_X86_64_Vectorcall : CallingConv<[ + // Vectorcall calling convention always returns FP values in XMMs. + CCIfType<[f32, f64, f128], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + + // Otherwise, everything is the same as Windows X86-64 C CC. + CCDelegateTo<RetCC_X86_Win64_C> +]>; + +// X86-64 HiPE return-value convention. +def RetCC_X86_64_HiPE : CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: HP, P, VAL1, VAL2 + CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>> +]>; + +// X86-64 WebKit_JS return-value convention. +def RetCC_X86_64_WebKit_JS : CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: RAX + CCIfType<[i64], CCAssignToReg<[RAX]>> +]>; + +def RetCC_X86_64_Swift : CallingConv<[ + + CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>, + + // For integers, ECX, R8D can be used as extra return registers. + CCIfType<[v1i1], CCPromoteToType<i8>>, + CCIfType<[i1], CCPromoteToType<i8>>, + CCIfType<[i8] , CCAssignToReg<[AL, DL, CL, R8B]>>, + CCIfType<[i16], CCAssignToReg<[AX, DX, CX, R8W]>>, + CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX, R8D]>>, + CCIfType<[i64], CCAssignToReg<[RAX, RDX, RCX, R8]>>, + + // XMM0, XMM1, XMM2 and XMM3 can be used to return FP values. + CCIfType<[f32], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + CCIfType<[f64], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + CCIfType<[f128], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + + // MMX vector types are returned in XMM0, XMM1, XMM2 and XMM3. + CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + CCDelegateTo<RetCC_X86Common> +]>; + +// X86-64 AnyReg return-value convention. No explicit register is specified for +// the return-value. The register allocator is allowed and expected to choose +// any free register. +// +// This calling convention is currently only supported by the stackmap and +// patchpoint intrinsics. All other uses will result in an assert on Debug +// builds. On Release builds we fallback to the X86 C calling convention. +def RetCC_X86_64_AnyReg : CallingConv<[ + CCCustom<"CC_X86_AnyReg_Error"> +]>; + +// X86-64 HHVM return-value convention. +def RetCC_X86_64_HHVM: CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: could return in any GP register save RSP and R12. + CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14, R15]>> +]>; + + +defm X86_32_RegCall : + X86_RegCall_base<RC_X86_32_RegCall>; +defm X86_Win64_RegCall : + X86_RegCall_base<RC_X86_64_RegCall_Win>; +defm X86_SysV64_RegCall : + X86_RegCall_base<RC_X86_64_RegCall_SysV>; + +// This is the root return-value convention for the X86-32 backend. +def RetCC_X86_32 : CallingConv<[ + // If FastCC, use RetCC_X86_32_Fast. 
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>, + // If HiPE, use RetCC_X86_32_HiPE. + CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>, + CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_32_RegCall>>, + + // Otherwise, use RetCC_X86_32_C. + CCDelegateTo<RetCC_X86_32_C> +]>; + +// This is the root return-value convention for the X86-64 backend. +def RetCC_X86_64 : CallingConv<[ + // HiPE uses RetCC_X86_64_HiPE + CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>, + + // Handle JavaScript calls. + CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>, + CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>, + + // Handle Swift calls. + CCIfCC<"CallingConv::Swift", CCDelegateTo<RetCC_X86_64_Swift>>, + + // Handle explicit CC selection + CCIfCC<"CallingConv::Win64", CCDelegateTo<RetCC_X86_Win64_C>>, + CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, + + // Handle Vectorcall CC + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_64_Vectorcall>>, + + // Handle HHVM calls. + CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>, + + CCIfCC<"CallingConv::X86_RegCall", + CCIfSubtarget<"isTargetWin64()", + CCDelegateTo<RetCC_X86_Win64_RegCall>>>, + CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_SysV64_RegCall>>, + + // Mingw64 and native Win64 use Win64 CC + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, + + // Otherwise, drop to normal X86-64 CC + CCDelegateTo<RetCC_X86_64_C> +]>; + +// This is the return-value convention used for the entire X86 backend. +def RetCC_X86 : CallingConv<[ + + // Check if this is the Intel OpenCL built-ins calling convention + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>, + + CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>, + CCDelegateTo<RetCC_X86_32> +]>; + +//===----------------------------------------------------------------------===// +// X86-64 Argument Calling Conventions +//===----------------------------------------------------------------------===// + +def CC_X86_64_C : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<8, 8>>, + + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCIfSubtarget<"isTarget64BitILP32()", CCAssignToReg<[R10D]>>>, + CCIfNest<CCAssignToReg<[R10]>>, + + // Pass SwiftSelf in a callee saved register. + CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[R13]>>>, + + // A SwiftError is passed in R12. + CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>, + + // For Swift Calling Convention, pass sret in %rax. + CCIfCC<"CallingConv::Swift", + CCIfSRet<CCIfType<[i64], CCAssignToReg<[RAX]>>>>, + + // The first 6 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>, + CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, + + // The first 8 MMX vector arguments are passed in XMM registers on Darwin. + CCIfType<[x86mmx], + CCIfSubtarget<"isTargetDarwin()", + CCIfSubtarget<"hasSSE2()", + CCPromoteToType<v2i64>>>>, + + // Boolean vectors of AVX-512 are passed in SIMD registers. + // The call from AVX to AVX-512 function should work, + // since the boolean types in AVX/AVX2 are promoted by default. 
+ CCIfType<[v2i1], CCPromoteToType<v2i64>>, + CCIfType<[v4i1], CCPromoteToType<v4i32>>, + CCIfType<[v8i1], CCPromoteToType<v8i16>>, + CCIfType<[v16i1], CCPromoteToType<v16i8>>, + CCIfType<[v32i1], CCPromoteToType<v32i8>>, + CCIfType<[v64i1], CCPromoteToType<v64i8>>, + + // The first 8 FP/Vector arguments are passed in XMM registers. + CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasSSE1()", + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, + + // The first 8 256-bit vector arguments are passed in YMM registers, unless + // this is a vararg function. + // FIXME: This isn't precisely correct; the x86-64 ABI document says that + // fixed arguments to vararg functions are supposed to be passed in + // registers. Actually modeling that would be a lot of work, though. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, + YMM4, YMM5, YMM6, YMM7]>>>>, + + // The first 8 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfSubtarget<"hasAVX512()", + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Long doubles get stack slots whose size and alignment depends on the + // subtarget. + CCIfType<[f80, f128], CCAssignToStack<0, 0>>, + + // Vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + + // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. + CCIfType<[v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> +]>; + +// Calling convention for X86-64 HHVM. +def CC_X86_64_HHVM : CallingConv<[ + // Use all/any GP registers for args, except RSP. + CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15, + RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14]>> +]>; + +// Calling convention for helper functions in HHVM. +def CC_X86_64_HHVM_C : CallingConv<[ + // Pass the first argument in RBP. + CCIfType<[i64], CCAssignToReg<[RBP]>>, + + // Otherwise it's the same as the regular C calling convention. + CCDelegateTo<CC_X86_64_C> +]>; + +// Calling convention used on Win64 +def CC_X86_Win64_C : CallingConv<[ + // FIXME: Handle byval stuff. + // FIXME: Handle varargs. + + // Promote i1/v1i1 arguments to i8. + CCIfType<[i1, v1i1], CCPromoteToType<i8>>, + + // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCAssignToReg<[R10]>>, + + // A SwiftError is passed in R12. + CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>, + + // 128 bit vectors are passed by pointer + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>, + + + // 256 bit vectors are passed by pointer + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>, + + // 512 bit vectors are passed by pointer + CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>, + + // Long doubles are passed by pointer + CCIfType<[f80], CCPassIndirect<i64>>, + + // The first 4 MMX vector arguments are passed in GPRs. 
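+  // (x86mmx is first bitconverted to i64, so these values then follow the
+  // ordinary i64 register/shadow rule further down.)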
+ CCIfType<[x86mmx], CCBitConvertToType<i64>>, + + // The first 4 integer arguments are passed in integer registers. + CCIfType<[i8 ], CCAssignToRegWithShadow<[CL , DL , R8B , R9B ], + [XMM0, XMM1, XMM2, XMM3]>>, + CCIfType<[i16], CCAssignToRegWithShadow<[CX , DX , R8W , R9W ], + [XMM0, XMM1, XMM2, XMM3]>>, + CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ], + [XMM0, XMM1, XMM2, XMM3]>>, + + // Do not pass the sret argument in RCX, the Win64 thiscall calling + // convention requires "this" to be passed in RCX. + CCIfCC<"CallingConv::X86_ThisCall", + CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8 , R9 ], + [XMM1, XMM2, XMM3]>>>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ], + [XMM0, XMM1, XMM2, XMM3]>>, + + // The first 4 FP/Vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3], + [RCX , RDX , R8 , R9 ]>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i8, i16, i32, i64, f32, f64], CCAssignToStack<8, 8>> +]>; + +def CC_X86_Win64_VectorCall : CallingConv<[ + CCCustom<"CC_X86_64_VectorCall">, + + // Delegate to fastcall to handle integer types. + CCDelegateTo<CC_X86_Win64_C> +]>; + + +def CC_X86_64_GHC : CallingConv<[ + // Promote i8/i16/i32 arguments to i64. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Pass in STG registers: Base, Sp, Hp, R1, R2, R3, R4, R5, R6, SpLim + CCIfType<[i64], + CCAssignToReg<[R13, RBP, R12, RBX, R14, RSI, RDI, R8, R9, R15]>>, + + // Pass in STG registers: F1, F2, F3, F4, D1, D2 + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfSubtarget<"hasSSE1()", + CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>, + // AVX + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM1, YMM2, YMM3, YMM4, YMM5, YMM6]>>>, + // AVX-512 + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfSubtarget<"hasAVX512()", + CCAssignToReg<[ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6]>>> +]>; + +def CC_X86_64_HiPE : CallingConv<[ + // Promote i8/i16/i32 arguments to i64. + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2, ARG3 + CCIfType<[i64], CCAssignToReg<[R15, RBP, RSI, RDX, RCX, R8]>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. + CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>> +]>; + +def CC_X86_64_WebKit_JS : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Only the first integer argument is passed in register. + CCIfType<[i32], CCAssignToReg<[EAX]>>, + CCIfType<[i64], CCAssignToReg<[RAX]>>, + + // The remaining integer arguments are passed on the stack. 32bit integer and + // floating-point arguments are aligned to 4 byte and stored in 4 byte slots. + // 64bit integer and floating-point arguments are aligned to 8 byte and stored + // in 8 byte stack slots. + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64], CCAssignToStack<8, 8>> +]>; + +// No explicit register is specified for the AnyReg calling convention. The +// register allocator may assign the arguments to any free register. +// +// This calling convention is currently only supported by the stackmap and +// patchpoint intrinsics. 
All other uses will result in an assert on Debug +// builds. On Release builds we fallback to the X86 C calling convention. +def CC_X86_64_AnyReg : CallingConv<[ + CCCustom<"CC_X86_AnyReg_Error"> +]>; + +//===----------------------------------------------------------------------===// +// X86 C Calling Convention +//===----------------------------------------------------------------------===// + +/// CC_X86_32_Vector_Common - In all X86-32 calling conventions, extra vector +/// values are spilled on the stack. +def CC_X86_32_Vector_Common : CallingConv<[ + // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCAssignToStack<32, 32>>, + + // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToStack<64, 64>> +]>; + +// CC_X86_32_Vector_Standard - The first 3 vector arguments are passed in +// vector registers +def CC_X86_32_Vector_Standard : CallingConv<[ + // SSE vector arguments are passed in XMM registers. + CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2]>>>, + + // AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM0, YMM1, YMM2]>>>>, + + // AVX 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2]>>>, + + CCDelegateTo<CC_X86_32_Vector_Common> +]>; + +// CC_X86_32_Vector_Darwin - The first 4 vector arguments are passed in +// vector registers. +def CC_X86_32_Vector_Darwin : CallingConv<[ + // SSE vector arguments are passed in XMM registers. + CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>, + + // AVX 256-bit vector arguments are passed in YMM registers. + CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>, + + // AVX 512-bit vector arguments are passed in ZMM registers. + CCIfNotVarArg<CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>>, + + CCDelegateTo<CC_X86_32_Vector_Common> +]>; + +/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP +/// values are spilled on the stack. +def CC_X86_32_Common : CallingConv<[ + // Handles byval parameters. + CCIfByVal<CCPassByVal<4, 4>>, + + // The first 3 float or double arguments, if marked 'inreg' and if the call + // is not a vararg call and if SSE2 is available, are passed in SSE registers. + CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64], + CCIfSubtarget<"hasSSE2()", + CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>, + + // The first 3 __m64 vector arguments are passed in mmx registers if the + // call is not a vararg call. + CCIfNotVarArg<CCIfType<[x86mmx], + CCAssignToReg<[MM0, MM1, MM2]>>>, + + // Integer/Float values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + + // Doubles get 8-byte slots that are 4-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 4>>, + + // Long doubles get slots whose size depends on the subtarget. 
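+  // (A slot size of 0 in CCAssignToStack appears to fall back to the ABI size
+  // of the type being assigned; see TargetCallingConv.td.)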
+ CCIfType<[f80], CCAssignToStack<0, 4>>, + + // Boolean vectors of AVX-512 are passed in SIMD registers. + // The call from AVX to AVX-512 function should work, + // since the boolean types in AVX/AVX2 are promoted by default. + CCIfType<[v2i1], CCPromoteToType<v2i64>>, + CCIfType<[v4i1], CCPromoteToType<v4i32>>, + CCIfType<[v8i1], CCPromoteToType<v8i16>>, + CCIfType<[v16i1], CCPromoteToType<v16i8>>, + CCIfType<[v32i1], CCPromoteToType<v32i8>>, + CCIfType<[v64i1], CCPromoteToType<v64i8>>, + + // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are + // passed in the parameter area. + CCIfType<[x86mmx], CCAssignToStack<8, 4>>, + + // Darwin passes vectors in a form that differs from the i386 psABI + CCIfSubtarget<"isTargetDarwin()", CCDelegateTo<CC_X86_32_Vector_Darwin>>, + + // Otherwise, drop to 'normal' X86-32 CC + CCDelegateTo<CC_X86_32_Vector_Standard> +]>; + +def CC_X86_32_C : CallingConv<[ + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in ECX. + CCIfNest<CCAssignToReg<[ECX]>>, + + // The first 3 integer arguments, if marked 'inreg' and if the call is not + // a vararg call, are passed in integer registers. + CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_MCU : CallingConv<[ + // Handles byval parameters. Note that, like FastCC, we can't rely on + // the delegation to CC_X86_32_Common because that happens after code that + // puts arguments in registers. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, + + // If the call is not a vararg call, some arguments may be passed + // in integer registers. + CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_FastCall : CallingConv<[ + // Promote i1 to i8. + CCIfType<[i1], CCPromoteToType<i8>>, + + // The 'nest' parameter, if any, is passed in EAX. + CCIfNest<CCAssignToReg<[EAX]>>, + + // The first 2 integer arguments are passed in ECX/EDX + CCIfInReg<CCIfType<[ i8], CCAssignToReg<[ CL, DL]>>>, + CCIfInReg<CCIfType<[i16], CCAssignToReg<[ CX, DX]>>>, + CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_Win32_VectorCall : CallingConv<[ + // Pass floating point in XMMs + CCCustom<"CC_X86_32_VectorCall">, + + // Delegate to fastcall to handle integer types. + CCDelegateTo<CC_X86_32_FastCall> +]>; + +def CC_X86_32_ThisCall_Common : CallingConv<[ + // The first integer argument is passed in ECX + CCIfType<[i32], CCAssignToReg<[ECX]>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_ThisCall_Mingw : CallingConv<[ + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, + + CCDelegateTo<CC_X86_32_ThisCall_Common> +]>; + +def CC_X86_32_ThisCall_Win : CallingConv<[ + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, + + // Pass sret arguments indirectly through stack. 
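+  // (ECX is reserved for "this" under thiscall, so the sret pointer gets a
+  // 4-byte stack slot instead of a register.)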
+ CCIfSRet<CCAssignToStack<4, 4>>, + + CCDelegateTo<CC_X86_32_ThisCall_Common> +]>; + +def CC_X86_32_ThisCall : CallingConv<[ + CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>, + CCDelegateTo<CC_X86_32_ThisCall_Win> +]>; + +def CC_X86_32_FastCC : CallingConv<[ + // Handles byval parameters. Note that we can't rely on the delegation + // to CC_X86_32_Common for this because that happens after code that + // puts arguments in registers. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i1/i8/i16/v1i1 arguments to i32. + CCIfType<[i1, i8, i16, v1i1], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in EAX. + CCIfNest<CCAssignToReg<[EAX]>>, + + // The first 2 integer arguments are passed in ECX/EDX + CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>, + + // The first 3 float or double arguments, if the call is not a vararg + // call and if SSE2 is available, are passed in SSE registers. + CCIfNotVarArg<CCIfType<[f32,f64], + CCIfSubtarget<"hasSSE2()", + CCAssignToReg<[XMM0,XMM1,XMM2]>>>>, + + // Doubles get 8-byte slots that are 8-byte aligned. + CCIfType<[f64], CCAssignToStack<8, 8>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + +def CC_X86_32_GHC : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Pass in STG registers: Base, Sp, Hp, R1 + CCIfType<[i32], CCAssignToReg<[EBX, EBP, EDI, ESI]>> +]>; + +def CC_X86_32_HiPE : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // Pass in VM's registers: HP, P, ARG0, ARG1, ARG2 + CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX, ECX]>>, + + // Integer/Float values get stored in stack slots that are 4 bytes in + // size and 4-byte aligned. + CCIfType<[i32, f32], CCAssignToStack<4, 4>> +]>; + +// X86-64 Intel OpenCL built-ins calling convention. +def CC_Intel_OCL_BI : CallingConv<[ + + CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>, + CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>, + + CCIfType<[i32], CCIfSubtarget<"is64Bit()", CCAssignToReg<[EDI, ESI, EDX, ECX]>>>, + CCIfType<[i64], CCIfSubtarget<"is64Bit()", CCAssignToReg<[RDI, RSI, RDX, RCX]>>>, + + CCIfType<[i32], CCAssignToStack<4, 4>>, + + // The SSE vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + + // The 256-bit vector arguments are passed in YMM registers. + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>, + + // The 512-bit vector arguments are passed in ZMM registers. + CCIfType<[v16f32, v8f64, v16i32, v8i64], + CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>, + + // Pass masks in mask registers + CCIfType<[v16i1, v8i1], CCAssignToReg<[K1]>>, + + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, + CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64_C>>, + CCDelegateTo<CC_X86_32_C> +]>; + +def CC_X86_32_Intr : CallingConv<[ + CCAssignToStack<4, 4> +]>; + +def CC_X86_64_Intr : CallingConv<[ + CCAssignToStack<8, 8> +]>; + +//===----------------------------------------------------------------------===// +// X86 Root Argument Calling Conventions +//===----------------------------------------------------------------------===// + +// This is the root argument convention for the X86-32 backend. 
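+// For example, a plain i32 argument to a cdecl call matches none of the
+// CCIfCC entries below, reaches CC_X86_32_C, is neither 'nest' nor 'inreg',
+// and finally lands in a 4-byte stack slot via CC_X86_32_Common.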
+def CC_X86_32 : CallingConv<[ + // X86_INTR calling convention is valid in MCU target and should override the + // MCU calling convention. Thus, this should be checked before isTargetMCU(). + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>, + CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>, + CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>, + CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, + CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, + CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, + CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>, + + // Otherwise, drop to normal X86-32 CC + CCDelegateTo<CC_X86_32_C> +]>; + +// This is the root argument convention for the X86-64 backend. +def CC_X86_64 : CallingConv<[ + CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>, + CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>, + CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>, + CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>, + CCIfCC<"CallingConv::Win64", CCDelegateTo<CC_X86_Win64_C>>, + CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, + CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>, + CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>, + CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>, + CCIfCC<"CallingConv::X86_RegCall", + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_RegCall>>>, + CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_SysV64_RegCall>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>, + + // Mingw64 and native Win64 use Win64 CC + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, + + // Otherwise, drop to normal X86-64 CC + CCDelegateTo<CC_X86_64_C> +]>; + +// This is the argument convention used for the entire X86 backend. +def CC_X86 : CallingConv<[ + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>, + CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>, + CCDelegateTo<CC_X86_32> +]>; + +//===----------------------------------------------------------------------===// +// Callee-saved Registers. +//===----------------------------------------------------------------------===// + +def CSR_NoRegs : CalleeSavedRegs<(add)>; + +def CSR_32 : CalleeSavedRegs<(add ESI, EDI, EBX, EBP)>; +def CSR_64 : CalleeSavedRegs<(add RBX, R12, R13, R14, R15, RBP)>; + +def CSR_64_SwiftError : CalleeSavedRegs<(sub CSR_64, R12)>; + +def CSR_32EHRet : CalleeSavedRegs<(add EAX, EDX, CSR_32)>; +def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; + +def CSR_Win64_NoSSE : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15)>; + +def CSR_Win64 : CalleeSavedRegs<(add CSR_Win64_NoSSE, + (sequence "XMM%u", 6, 15))>; + +def CSR_Win64_SwiftError : CalleeSavedRegs<(sub CSR_Win64, R12)>; + +// The function used by Darwin to obtain the address of a thread-local variable +// uses rdi to pass a single parameter and rax for the return value. All other +// GPRs are preserved. +def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, + R8, R9, R10, R11)>; + +// CSRs that are handled by prologue, epilogue. +def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add RBP)>; + +// CSRs that are handled explicitly via copies. 
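+// ((sub X, Y) removes Y's registers from the set X, (add ...) unions sets,
+// and (sequence "XMM%u", A, B) expands to the inclusive range XMM<A>..XMM<B>.)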
+def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(sub CSR_64_TLS_Darwin, RBP)>; + +// All GPRs - except r11 +def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, + R8, R9, R10, RSP)>; + +// All registers - except r11 +def CSR_64_RT_AllRegs : CalleeSavedRegs<(add CSR_64_RT_MostRegs, + (sequence "XMM%u", 0, 15))>; +def CSR_64_RT_AllRegs_AVX : CalleeSavedRegs<(add CSR_64_RT_MostRegs, + (sequence "YMM%u", 0, 15))>; + +def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10, + R11, R12, R13, R14, R15, RBP, + (sequence "XMM%u", 0, 15))>; + +def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI, + EDI)>; +def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs, + (sequence "XMM%u", 0, 7))>; +def CSR_32_AllRegs_AVX : CalleeSavedRegs<(add CSR_32_AllRegs, + (sequence "YMM%u", 0, 7))>; +def CSR_32_AllRegs_AVX512 : CalleeSavedRegs<(add CSR_32_AllRegs, + (sequence "ZMM%u", 0, 7), + (sequence "K%u", 0, 7))>; + +def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX)>; +def CSR_64_AllRegs_NoSSE : CalleeSavedRegs<(add RAX, RBX, RCX, RDX, RSI, RDI, R8, R9, + R10, R11, R12, R13, R14, R15, RBP)>; +def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, + (sequence "YMM%u", 0, 15)), + (sequence "XMM%u", 0, 15))>; +def CSR_64_AllRegs_AVX512 : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, + (sequence "ZMM%u", 0, 31), + (sequence "K%u", 0, 7)), + (sequence "XMM%u", 0, 15))>; + +// Standard C + YMM6-15 +def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, + R13, R14, R15, + (sequence "YMM%u", 6, 15))>; + +def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, + R12, R13, R14, R15, + (sequence "ZMM%u", 6, 21), + K4, K5, K6, K7)>; +//Standard C + XMM 8-15 +def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, + (sequence "XMM%u", 8, 15))>; + +//Standard C + YMM 8-15 +def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, + (sequence "YMM%u", 8, 15))>; + +def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15, + (sequence "ZMM%u", 16, 31), + K4, K5, K6, K7)>; + +// Only R12 is preserved for PHP calls in HHVM. +def CSR_64_HHVM : CalleeSavedRegs<(add R12)>; + +// Register calling convention preserves few GPR and XMM8-15 +def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP, ESP)>; +def CSR_32_RegCall : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE, + (sequence "XMM%u", 4, 7))>; +def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP, + (sequence "R%u", 10, 15))>; +def CSR_Win64_RegCall : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE, + (sequence "XMM%u", 8, 15))>; +def CSR_SysV64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP, + (sequence "R%u", 12, 15))>; +def CSR_SysV64_RegCall : CalleeSavedRegs<(add CSR_SysV64_RegCall_NoSSE, + (sequence "XMM%u", 8, 15))>; + diff --git a/capstone/suite/synctools/tablegen/X86/back/X86Capstone.td b/capstone/suite/synctools/tablegen/X86/back/X86Capstone.td new file mode 100644 index 000000000..ca21b63a0 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86Capstone.td @@ -0,0 +1,7 @@ +// Capstone definitions fix for X86 LLVM instructions. 
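+//
+// The defs below use the generic X86 "I" instruction format:
+//   I<opcode byte, encoding form, (outs ...), (ins ...), asm string, pattern list>
+// An empty pattern list means the instruction is only described for the
+// assembler/disassembler tables and carries no ISel pattern.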
+ +let Defs = [EFLAGS] in + def INT1 : I<0xf1, RawFrm, (outs), (ins), "int1", []>; + +// def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>; +def FSETPM : I<0xDB, MRM_E4, (outs), (ins), "fsetpm", []>; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86CapstoneFull.td b/capstone/suite/synctools/tablegen/X86/back/X86CapstoneFull.td new file mode 100644 index 000000000..5bd4095dd --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86CapstoneFull.td @@ -0,0 +1,103 @@ +// Capstone definitions fix for X86 LLVM instructions. + +let Defs = [EFLAGS] in + def INT1 : I<0xf1, RawFrm, (outs), (ins), "int1", []>; + +def FSETPM : I<0xDB, MRM_E4, (outs), (ins), "fsetpm", []>; + +// Capstone: comment out below lines for X86 Reduce mode + +/* +// X87 Floating Point Stack. +include "X86InstrFPStack.td" + +// SIMD support (SSE, MMX and AVX) +include "X86InstrFragmentsSIMD.td" + +// FMA - Fused Multiply-Add support (requires FMA) +include "X86InstrFMA.td" + +// XOP +include "X86InstrXOP.td" + +// SSE, MMX and 3DNow! vector support. +include "X86InstrSSE.td" +include "X86InstrAVX512.td" +include "X86InstrMMX.td" +include "X86Instr3DNow.td" + +// MPX instructions +include "X86InstrMPX.td" + +//include "X86InstrTSX.td" +include "X86InstrSGX.td" + +// Various unary fpstack operations default to operating on ST1. +// For example, "fxch" -> "fxch %st(1)" +def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; +def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>; +def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; +def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; +def : InstAlias<"fxch", (XCH_F ST1), 0>; +def : InstAlias<"fcom", (COM_FST0r ST1), 0>; +def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>; +def : InstAlias<"fcomi", (COM_FIr ST1), 0>; +def : InstAlias<"fcompi", (COM_FIPr ST1), 0>; +def : InstAlias<"fucom", (UCOM_Fr ST1), 0>; +def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>; +def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>; +def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>; + +// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op. +// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate +// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with +// gas. +multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { + def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"), + (Inst RST:$op), EmitAlias>; + def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"), + (Inst ST0), EmitAlias>; +} + +defm : FpUnaryAlias<"fadd", ADD_FST0r>; +defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; +defm : FpUnaryAlias<"fsub", SUB_FST0r>; +defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>; +defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; +defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>; +defm : FpUnaryAlias<"fmul", MUL_FST0r>; +defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; +defm : FpUnaryAlias<"fdiv", DIV_FST0r>; +defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>; +defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; +defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>; +defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; +defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; +defm : FpUnaryAlias<"fcompi", COM_FIPr>; +defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; + + +// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they +// commute. 
We also allow fdiv[r]p/fsubrp even though they don't commute, +// solely because gas supports it. +def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>; +def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>; +def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>; + +def : InstAlias<"fnstsw" , (FNSTSW16r), 0>; + +// Match 'movd GR64, MMX' as an alias for movq to be compatible with gas, +// which supports this due to an old AMD documentation bug when 64-bit mode was +// created. +def : InstAlias<"movd\t{$src, $dst|$dst, $src}", + (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; +def : InstAlias<"movd\t{$src, $dst|$dst, $src}", + (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; +*/ diff --git a/capstone/suite/synctools/tablegen/X86/back/X86CapstoneReduce.td b/capstone/suite/synctools/tablegen/X86/back/X86CapstoneReduce.td new file mode 100644 index 000000000..2c0920e74 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86CapstoneReduce.td @@ -0,0 +1,101 @@ +// Capstone definitions fix for X86 LLVM instructions. + +let Defs = [EFLAGS] in + def INT1 : I<0xf1, RawFrm, (outs), (ins), "int1", []>; + +def FSETPM : I<0xDB, MRM_E4, (outs), (ins), "fsetpm", []>; + +// Capstone: comment out below lines for X86 Reduce mode + +// X87 Floating Point Stack. +//include "X86InstrFPStack.td" + +// SIMD support (SSE, MMX and AVX) +//include "X86InstrFragmentsSIMD.td" + +// FMA - Fused Multiply-Add support (requires FMA) +//include "X86InstrFMA.td" + +// XOP +//include "X86InstrXOP.td" + +// SSE, MMX and 3DNow! vector support. +//include "X86InstrSSE.td" +//include "X86InstrAVX512.td" +//include "X86InstrMMX.td" +//include "X86Instr3DNow.td" + +// MPX instructions +//include "X86InstrMPX.td" + +//include "X86InstrTSX.td" +//include "X86InstrSGX.td" + +// Various unary fpstack operations default to operating on ST1. +// For example, "fxch" -> "fxch %st(1)" +//def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; +//def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>; +//def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; +//def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +//def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>; +//def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; +//def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; +//def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; +//def : InstAlias<"fxch", (XCH_F ST1), 0>; +//def : InstAlias<"fcom", (COM_FST0r ST1), 0>; +//def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>; +//def : InstAlias<"fcomi", (COM_FIr ST1), 0>; +//def : InstAlias<"fcompi", (COM_FIPr ST1), 0>; +//def : InstAlias<"fucom", (UCOM_Fr ST1), 0>; +//def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>; +//def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>; +//def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>; + +// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op. +// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate +// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with +// gas. 
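+// An instantiation such as 'defm : FpUnaryAlias<"fadd", ADD_FST0r>;' would
+// yield one alias matching "fadd $op, %st(0)" for any stack register $op and
+// one matching "fadd %st(0), %st(0)", both mapping to the same instruction.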
+multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { + def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"), + (Inst RST:$op), EmitAlias>; + def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"), + (Inst ST0), EmitAlias>; +} + +//defm : FpUnaryAlias<"fadd", ADD_FST0r>; +//defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; +//defm : FpUnaryAlias<"fsub", SUB_FST0r>; +//defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>; +//defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; +//defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>; +//defm : FpUnaryAlias<"fmul", MUL_FST0r>; +//defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; +//defm : FpUnaryAlias<"fdiv", DIV_FST0r>; +//defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>; +//defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; +//defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>; +//defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; +//defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; +//defm : FpUnaryAlias<"fcompi", COM_FIPr>; +//defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; + + +// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they +// commute. We also allow fdiv[r]p/fsubrp even though they don't commute, +// solely because gas supports it. +//def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>; +//def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>; +//def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>; +//def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>; +//def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>; +//def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>; +// +//def : InstAlias<"fnstsw" , (FNSTSW16r), 0>; + +// Match 'movd GR64, MMX' as an alias for movq to be compatible with gas, +// which supports this due to an old AMD documentation bug when 64-bit mode was +// created. +//def : InstAlias<"movd\t{$src, $dst|$dst, $src}", +// (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; +//def : InstAlias<"movd\t{$src, $dst|$dst, $src}", +// (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86Instr3DNow.td b/capstone/suite/synctools/tablegen/X86/back/X86Instr3DNow.td new file mode 100644 index 000000000..46dc6bf76 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86Instr3DNow.td @@ -0,0 +1,111 @@ +//===-- X86Instr3DNow.td - The 3DNow! Instruction Set ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the 3DNow! instruction set, which extends MMX to support +// floating point and also adds a few more random instructions for good measure. 
+// +//===----------------------------------------------------------------------===// + +class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat> + : I<o, F, outs, ins, asm, pat>, Requires<[Has3DNow]> { +} + +class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>, ThreeDNow { + let Constraints = "$src1 = $dst"; +} + +class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> + : I3DNow<o, F, (outs VR64:$dst), ins, + !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, ThreeDNow; + +multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, + X86FoldableSchedWrite sched, bit Commutable = 0, + string Ver = ""> { + let isCommutable = Commutable in + def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>, + Sched<[sched]>; + def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, + X86FoldableSchedWrite sched, string Ver = ""> { + def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>, + Sched<[sched]>; + def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn, + [(set VR64:$dst, (!cast<Intrinsic>( + !strconcat("int_x86_3dnow", Ver, "_", Mn)) + (bitconvert (load_mmx addr:$src))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", SchedWriteVecALU.MMX, 1>; +defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id", WriteCvtPS2I>; +defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc", WriteFAdd>; +defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", WriteFAdd, 1>; +defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", WriteFAdd, 1>; +defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge", WriteFAdd>; +defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt", WriteFAdd>; +defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax", WriteFAdd>; +defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin", WriteFAdd>; +defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", WriteFAdd, 1>; +defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp", WriteFAdd>; +defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1", WriteFAdd>; +defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2", WriteFAdd>; +defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1", WriteFAdd>; +defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt", WriteFAdd>; +defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", WriteFAdd, 1>; +defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", WriteFAdd, 1>; +defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", WriteCvtI2PS>; +defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", SchedWriteVecIMul.MMX, 1>; + +let SchedRW = [WriteEMMS] in +def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", + [(int_x86_mmx_femms)]>, TB; + +// PREFETCHWT1 is supported we want to use it for everything but T0. +def PrefetchWLevel : PatFrag<(ops), (i32 imm), [{ + return N->getSExtValue() == 3 || !Subtarget->hasPREFETCHWT1(); +}]>; + +// Use PREFETCHWT1 for NTA, T2, T1. 
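+// (The llvm.prefetch locality hint runs from 0 (NTA) up to 3 (T0), so the
+// "Imm < 3" check below covers exactly NTA, T2 and T1.)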
+def PrefetchWT1Level : ImmLeaf<i32, [{ + return Imm < 3; +}]>; + +let SchedRW = [WriteLoad] in { +let Predicates = [Has3DNow, NoSSEPrefetch] in +def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr), + "prefetch\t$addr", + [(prefetch addr:$addr, imm, imm, (i32 1))]>, TB; + +def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr", + [(prefetch addr:$addr, (i32 1), (i32 PrefetchWLevel), (i32 1))]>, + TB, Requires<[HasPrefetchW]>; + +def PREFETCHWT1 : I<0x0D, MRM2m, (outs), (ins i8mem:$addr), "prefetchwt1\t$addr", + [(prefetch addr:$addr, (i32 1), (i32 PrefetchWT1Level), (i32 1))]>, + TB, Requires<[HasPREFETCHWT1]>; +} + +// "3DNowA" instructions +defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", WriteCvtPS2I, "a">; +defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", WriteCvtI2PS, "a">; +defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", WriteFAdd, 0, "a">; +defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", WriteFAdd, 0, "a">; +defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", SchedWriteShuffle.MMX, "a">; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrAVX512.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrAVX512.td new file mode 100644 index 000000000..43f16634c --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrAVX512.td @@ -0,0 +1,11968 @@ +//===-- X86InstrAVX512.td - AVX512 Instruction Set ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 AVX512 instruction set, defining the +// instructions, and properties of the instructions which are needed for code +// generation, machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +// Group template arguments that can be derived from the vector type (EltNum x +// EltVT). These are things like the register class for the writemask, etc. +// The idea is to pass one of these as the template argument rather than the +// individual arguments. +// The template is also used for scalar types, in this case numelts is 1. +class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, + string suffix = ""> { + RegisterClass RC = rc; + ValueType EltVT = eltvt; + int NumElts = numelts; + + // Corresponding mask register class. + RegisterClass KRC = !cast<RegisterClass>("VK" # NumElts); + + // Corresponding write-mask register class. + RegisterClass KRCWM = !cast<RegisterClass>("VK" # NumElts # "WM"); + + // The mask VT. + ValueType KVT = !cast<ValueType>("v" # NumElts # "i1"); + + // Suffix used in the instruction mnemonic. + string Suffix = suffix; + + // VTName is a string name for vector VT. For vector types it will be + // v # NumElts # EltVT, so for vector of 8 elements of i32 it will be v8i32 + // It is a little bit complex for scalar types, where NumElts = 1. + // In this case we build v4f32 or v2f64 + string VTName = "v" # !if (!eq (NumElts, 1), + !if (!eq (EltVT.Size, 32), 4, + !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT; + + // The vector VT. + ValueType VT = !cast<ValueType>(VTName); + + string EltTypeName = !cast<string>(EltVT); + // Size of the element type in bits, e.g. 32 for v16i32. 
+ string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName)); + int EltSize = EltVT.Size; + + // "i" for integer types and "f" for floating-point types + string TypeVariantName = !subst(EltSizeName, "", EltTypeName); + + // Size of RC in bits, e.g. 512 for VR512. + int Size = VT.Size; + + // The corresponding memory operand, e.g. i512mem for VR512. + X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem"); + X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem"); + // FP scalar memory operand for intrinsics - ssmem/sdmem. + Operand IntScalarMemOp = !if (!eq (EltTypeName, "f32"), !cast<Operand>("ssmem"), + !if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?)); + + // Load patterns + // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64 + // due to load promotion during legalization + PatFrag LdFrag = !cast<PatFrag>("load" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + !if (!eq (Size, 512), "v8i64", + VTName))), VTName)); + + PatFrag AlignedLdFrag = !cast<PatFrag>("alignedload" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + !if (!eq (Size, 512), "v8i64", + VTName))), VTName)); + + PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT); + + ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"), + !cast<ComplexPattern>("sse_load_f32"), + !if (!eq (EltTypeName, "f64"), + !cast<ComplexPattern>("sse_load_f64"), + ?)); + + // The string to specify embedded broadcast in assembly. + string BroadcastStr = "{1to" # NumElts # "}"; + + // 8-bit compressed displacement tuple/subvector format. This is only + // defined for NumElts <= 8. + CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0), + !cast<CD8VForm>("CD8VT" # NumElts), ?); + + SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm, + !if (!eq (Size, 256), sub_ymm, ?)); + + Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle, + !if (!eq (EltTypeName, "f64"), SSEPackedDouble, + SSEPackedInt)); + + RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X); + + // A vector tye of the same width with element type i64. This is used to + // create patterns for logic ops. + ValueType i64VT = !cast<ValueType>("v" # !srl(Size, 6) # "i64"); + + // A vector type of the same width with element type i32. This is used to + // create the canonical constant zero node ImmAllZerosV. 
+ ValueType i32VT = !cast<ValueType>("v" # !srl(Size, 5) # "i32"); + dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV))); + + string ZSuffix = !if (!eq (Size, 128), "Z128", + !if (!eq (Size, 256), "Z256", "Z")); +} + +def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">; +def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">; +def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; +def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; +def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; +def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; + +// "x" in v32i8x_info means RC = VR256X +def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">; +def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; +def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; +def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; +def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; +def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; + +def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">; +def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; +def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; +def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; +def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; +def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; + +// We map scalar types to the smallest (128-bit) vector type +// with the appropriate element type. This allows to use the same masking logic. +def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">; +def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">; +def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">; +def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">; + +class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256, + X86VectorVTInfo i128> { + X86VectorVTInfo info512 = i512; + X86VectorVTInfo info256 = i256; + X86VectorVTInfo info128 = i128; +} + +def avx512vl_i8_info : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info, + v16i8x_info>; +def avx512vl_i16_info : AVX512VLVectorVTInfo<v32i16_info, v16i16x_info, + v8i16x_info>; +def avx512vl_i32_info : AVX512VLVectorVTInfo<v16i32_info, v8i32x_info, + v4i32x_info>; +def avx512vl_i64_info : AVX512VLVectorVTInfo<v8i64_info, v4i64x_info, + v2i64x_info>; +def avx512vl_f32_info : AVX512VLVectorVTInfo<v16f32_info, v8f32x_info, + v4f32x_info>; +def avx512vl_f64_info : AVX512VLVectorVTInfo<v8f64_info, v4f64x_info, + v2f64x_info>; + +class X86KVectorVTInfo<RegisterClass _krc, RegisterClass _krcwm, + ValueType _vt> { + RegisterClass KRC = _krc; + RegisterClass KRCWM = _krcwm; + ValueType KVT = _vt; +} + +def v1i1_info : X86KVectorVTInfo<VK1, VK1WM, v1i1>; +def v2i1_info : X86KVectorVTInfo<VK2, VK2WM, v2i1>; +def v4i1_info : X86KVectorVTInfo<VK4, VK4WM, v4i1>; +def v8i1_info : X86KVectorVTInfo<VK8, VK8WM, v8i1>; +def v16i1_info : X86KVectorVTInfo<VK16, VK16WM, v16i1>; +def v32i1_info : X86KVectorVTInfo<VK32, VK32WM, v32i1>; +def v64i1_info : X86KVectorVTInfo<VK64, VK64WM, v64i1>; + +// This multiclass generates the masking variants from the non-masking +// variant. It only provides the assembly pieces for the masking variants. +// It assumes custom ISel patterns for masking which can be provided as +// template arguments. 
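+// Each instantiation produces three records: NAME (unmasked), NAME#k
+// (merge-masking, EVEX.K, result tied to $src0 via MaskingConstraint) and
+// NAME#kz (zero-masking, EVEX.KZ).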
+multiclass AVX512_maskable_custom<bits<8> O, Format F, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern, + list<dag> MaskingPattern, + list<dag> ZeroMaskingPattern, + string MaskingConstraint = "", + bit IsCommutable = 0, + bit IsKCommutable = 0, + bit IsKZCommutable = IsCommutable> { + let isCommutable = IsCommutable in + def NAME: AVX512<O, F, Outs, Ins, + OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# + "$dst, "#IntelSrcAsm#"}", + Pattern>; + + // Prefer over VMOV*rrk Pat<> + let isCommutable = IsKCommutable in + def NAME#k: AVX512<O, F, Outs, MaskingIns, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#"}", + MaskingPattern>, + EVEX_K { + // In case of the 3src subclass this is overridden with a let. + string Constraints = MaskingConstraint; + } + + // Zero mask does not add any restrictions to commute operands transformation. + // So, it is Ok to use IsCommutable instead of IsKCommutable. + let isCommutable = IsKZCommutable in // Prefer over VMOV*rrkz Pat<> + def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, "#IntelSrcAsm#"}", + ZeroMaskingPattern>, + EVEX_KZ; +} + + +// Common base class of AVX512_maskable and AVX512_maskable_3src. +multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, + SDNode Select = vselect, + string MaskingConstraint = "", + bit IsCommutable = 0, + bit IsKCommutable = 0, + bit IsKZCommutable = IsCommutable> : + AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [(set _.RC:$dst, RHS)], + [(set _.RC:$dst, MaskingRHS)], + [(set _.RC:$dst, + (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))], + MaskingConstraint, IsCommutable, + IsKCommutable, IsKZCommutable>; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the vector instruction. In the masking case, the +// perserved vector elements come from a new dummy input operand tied to $dst. +// This version uses a separate dag for non-masking and masking. +multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskRHS, + bit IsCommutable = 0, bit IsKCommutable = 0, + SDNode Select = vselect> : + AVX512_maskable_custom<O, F, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, + [(set _.RC:$dst, RHS)], + [(set _.RC:$dst, + (Select _.KRCWM:$mask, MaskRHS, _.RC:$src0))], + [(set _.RC:$dst, + (Select _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))], + "$src0 = $dst", IsCommutable, IsKCommutable>; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the vector instruction. In the masking case, the +// perserved vector elements come from a new dummy input operand tied to $dst. 
+multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, + bit IsCommutable = 0, bit IsKCommutable = 0, + bit IsKZCommutable = IsCommutable, + SDNode Select = vselect> : + AVX512_maskable_common<O, F, _, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (Select _.KRCWM:$mask, RHS, _.RC:$src0), + Select, "$src0 = $dst", IsCommutable, IsKCommutable, + IsKZCommutable>; + +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the scalar instruction. +multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, + bit IsCommutable = 0> : + AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm, + RHS, IsCommutable, 0, IsCommutable, X86selects>; + +// Similar to AVX512_maskable but in this case one of the source operands +// ($src1) is already tied to $dst so we just use that for the preserved +// vector elements. NOTE that the NonTiedIns (the ins dag) should exclude +// $src1. +multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, + bit IsCommutable = 0, + bit IsKCommutable = 0, + SDNode Select = vselect, + bit MaskOnly = 0> : + AVX512_maskable_common<O, F, _, Outs, + !con((ins _.RC:$src1), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, + !if(MaskOnly, (null_frag), RHS), + (Select _.KRCWM:$mask, RHS, _.RC:$src1), + Select, "", IsCommutable, IsKCommutable>; + +// Similar to AVX512_maskable_3src but in this case the input VT for the tied +// operand differs from the output VT. This requires a bitconvert on +// the preserved vector going into the vselect. +// NOTE: The unmasked pattern is disabled. 
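+// (null_frag is passed as the unmasked RHS below, which is how the unmasked
+// pattern is turned off.)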
+multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT, + X86VectorVTInfo InVT, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, bit IsCommutable = 0> : + AVX512_maskable_common<O, F, OutVT, Outs, + !con((ins InVT.RC:$src1), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, (null_frag), + (vselect InVT.KRCWM:$mask, RHS, + (bitconvert InVT.RC:$src1)), + vselect, "", IsCommutable>; + +multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, + bit IsCommutable = 0, + bit IsKCommutable = 0, + bit MaskOnly = 0> : + AVX512_maskable_3src<O, F, _, Outs, NonTiedIns, OpcodeStr, AttSrcAsm, + IntelSrcAsm, RHS, IsCommutable, IsKCommutable, + X86selects, MaskOnly>; + +multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern> : + AVX512_maskable_custom<O, F, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], + "$src0 = $dst">; + +multiclass AVX512_maskable_3src_in_asm<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern> : + AVX512_maskable_custom<O, F, Outs, + !con((ins _.RC:$src1), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], + "">; + +// Instruction with mask that puts result in mask register, +// like "compare" and "vptest" +multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F, + dag Outs, + dag Ins, dag MaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list<dag> Pattern, + list<dag> MaskingPattern, + bit IsCommutable = 0> { + let isCommutable = IsCommutable in + def NAME: AVX512<O, F, Outs, Ins, + OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# + "$dst, "#IntelSrcAsm#"}", + Pattern>; + + def NAME#k: AVX512<O, F, Outs, MaskingIns, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#"}", + MaskingPattern>, EVEX_K; +} + +multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, + bit IsCommutable = 0> : + AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr, + AttSrcAsm, IntelSrcAsm, + [(set _.KRC:$dst, RHS)], + [(set _.KRC:$dst, MaskingRHS)], IsCommutable>; + +multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, bit IsCommutable = 0> : + AVX512_maskable_common_cmp<O, F, _, Outs, Ins, + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (and _.KRCWM:$mask, RHS), IsCommutable>; + +multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm> : + AVX512_maskable_custom_cmp<O, F, Outs, + Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr, + AttSrcAsm, IntelSrcAsm, [], []>; + +// This multiclass generates the unconditional/non-masking, the masking and +// 
the zero-masking variant of the vector instruction. In the masking case, the +// perserved vector elements come from a new dummy input operand tied to $dst. +multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskedRHS, + bit IsCommutable = 0, SDNode Select = vselect> : + AVX512_maskable_custom<O, F, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, + [(set _.RC:$dst, RHS)], + [(set _.RC:$dst, + (Select _.KRCWM:$mask, MaskedRHS, _.RC:$src0))], + [(set _.RC:$dst, + (Select _.KRCWM:$mask, MaskedRHS, + _.ImmAllZerosV))], + "$src0 = $dst", IsCommutable>; + + +// Alias instruction that maps zero vector to pxor / xorp* for AVX-512. +// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then +// swizzled by ExecutionDomainFix to pxor. +// We set canFoldAsLoad because this can be converted to a constant-pool +// load of an all-zeros value if folding it would be beneficial. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in { +def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "", + [(set VR512:$dst, (v16i32 immAllZerosV))]>; +def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "", + [(set VR512:$dst, (v16i32 immAllOnesV))]>; +} + +// Alias instructions that allow VPTERNLOG to be used with a mask to create +// a mix of all ones and all zeros elements. This is done this way to force +// the same register to be used as input for all three sources. +let isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteVecALU] in { +def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst), + (ins VK16WM:$mask), "", + [(set VR512:$dst, (vselect (v16i1 VK16WM:$mask), + (v16i32 immAllOnesV), + (v16i32 immAllZerosV)))]>; +def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst), + (ins VK8WM:$mask), "", + [(set VR512:$dst, (vselect (v8i1 VK8WM:$mask), + (bc_v8i64 (v16i32 immAllOnesV)), + (bc_v8i64 (v16i32 immAllZerosV))))]>; +} + +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteZero] in { +def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "", + [(set VR128X:$dst, (v4i32 immAllZerosV))]>; +def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "", + [(set VR256X:$dst, (v8i32 immAllZerosV))]>; +} + +// Alias instructions that map fld0 to xorps for sse or vxorps for avx. +// This is expanded by ExpandPostRAPseudos. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasAVX512] in { + def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "", + [(set FR32X:$dst, fp32imm0)]>; + def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "", + [(set FR64X:$dst, fpimm0)]>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 - VECTOR INSERT +// + +// Supports two different pattern operators for mask and unmasked ops. Allows +// null_frag to be passed for one. 
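+// (The DQI-only 64x2/32x8 variants further down exploit this by passing
+// null_frag for the unmasked operator, so unmasked inserts keep using the
+// 32x4/64x4 forms.)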
+multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From, + X86VectorVTInfo To, + SDPatternOperator vinsert_insert, + SDPatternOperator vinsert_for_mask, + X86FoldableSchedWrite sched> { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + defm rr : AVX512_maskable_split<Opcode, MRMSrcReg, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.RC:$src2, u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm)), + (vinsert_for_mask:$src3 (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))>, + AVX512AIi8Base, EVEX_4V, Sched<[sched]>; + let mayLoad = 1 in + defm rm : AVX512_maskable_split<Opcode, MRMSrcMem, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm)), + (vinsert_for_mask:$src3 (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V, + EVEX_CD8<From.EltSize, From.CD8TupleForm>, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +// Passes the same pattern operator for masked and unmasked ops. +multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, + X86VectorVTInfo To, + SDPatternOperator vinsert_insert, + X86FoldableSchedWrite sched> : + vinsert_for_size_split<Opcode, From, To, vinsert_insert, vinsert_insert, sched>; + +multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> { + let Predicates = p in { + def : Pat<(vinsert_insert:$ins + (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + To.RC:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + + def : Pat<(vinsert_insert:$ins + (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rm") + To.RC:$src1, addr:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + } +} + +multiclass vinsert_for_type<ValueType EltVT32, int Opcode128, + ValueType EltVT64, int Opcode256, + X86FoldableSchedWrite sched> { + + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo< 8, EltVT32, VR256X>, + vinsert128_insert, sched>, EVEX_V256; + + defm NAME # "32x4Z" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert128_insert, sched>, EVEX_V512; + + defm NAME # "64x4Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert256_insert, sched>, VEX_W, EVEX_V512; + + // Even with DQI we'd like to only use these instructions for masking. + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vinsert_for_size_split<Opcode128, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + null_frag, vinsert128_insert, sched>, + VEX_W1X, EVEX_V256; + + // Even with DQI we'd like to only use these instructions for masking. 
+ let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : vinsert_for_size_split<Opcode128, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + null_frag, vinsert128_insert, sched>, + VEX_W, EVEX_V512; + + defm NAME # "32x8Z" : vinsert_for_size_split<Opcode256, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + null_frag, vinsert256_insert, sched>, + EVEX_V512; + } +} + +// FIXME: Is there a better scheduler class for VINSERTF/VINSERTI? +defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a, WriteFShuffle256>; +defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a, WriteShuffle256>; + +// Codegen pattern with the alternative types, +// Even with AVX512DQ we'll still use these for unmasked operations. +defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; + +defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; + +defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; + +// Codegen pattern with the alternative types insert VEC128 into VEC256 +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +// Codegen pattern with the alternative types insert VEC128 into VEC512 +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +// Codegen pattern with the alternative types insert VEC256 into VEC512 +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; + + +multiclass vinsert_for_mask_cast<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, X86VectorVTInfo Cast, + PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm, + list<Predicate> p> { +let Predicates = p in { + def : Pat<(Cast.VT + (vselect Cast.KRCWM:$mask, + (bitconvert + (vinsert_insert:$ins (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))), + Cast.RC:$src0)), + (!cast<Instruction>(InstrStr#"rrk") + Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm To.RC:$ins))>; + def : Pat<(Cast.VT + (vselect Cast.KRCWM:$mask, + (bitconvert + (vinsert_insert:$ins (To.VT To.RC:$src1), + (From.VT + (bitconvert + (From.LdFrag addr:$src2))), + (iPTR imm))), + Cast.RC:$src0)), + (!cast<Instruction>(InstrStr#"rmk") + Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, addr:$src2, + (INSERT_get_vinsert_imm To.RC:$ins))>; + + def : 
Pat<(Cast.VT + (vselect Cast.KRCWM:$mask, + (bitconvert + (vinsert_insert:$ins (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))), + Cast.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#"rrkz") + Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm To.RC:$ins))>; + def : Pat<(Cast.VT + (vselect Cast.KRCWM:$mask, + (bitconvert + (vinsert_insert:$ins (To.VT To.RC:$src1), + (From.VT + (bitconvert + (From.LdFrag addr:$src2))), + (iPTR imm))), + Cast.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#"rmkz") + Cast.KRCWM:$mask, To.RC:$src1, addr:$src2, + (INSERT_get_vinsert_imm To.RC:$ins))>; +} +} + +defm : vinsert_for_mask_cast<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info, + v8f32x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v4f32x_info, v8f32x_info, + v4f64x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>; + +defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info, + v8i32x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info, + v8i32x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info, + v8i32x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v4i32x_info, v8i32x_info, + v4i64x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>; +defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v8i16x_info, v16i16x_info, + v4i64x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>; +defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v16i8x_info, v32i8x_info, + v4i64x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>; + +defm : vinsert_for_mask_cast<"VINSERTF32x4Z", v2f64x_info, v8f64_info, + v16f32_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_mask_cast<"VINSERTF64x2Z", v4f32x_info, v16f32_info, + v8f64_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI]>; + +defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v2i64x_info, v8i64_info, + v16i32_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v8i16x_info, v32i16_info, + v16i32_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v16i8x_info, v64i8_info, + v16i32_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v4i32x_info, v16i32_info, + v8i64_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI]>; +defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v8i16x_info, v32i16_info, + v8i64_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI]>; +defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v16i8x_info, v64i8_info, + v8i64_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI]>; + +defm : vinsert_for_mask_cast<"VINSERTF32x8Z", v4f64x_info, v8f64_info, + v16f32_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasDQI]>; +defm : vinsert_for_mask_cast<"VINSERTF64x4Z", v8f32x_info, v16f32_info, + v8f64_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasAVX512]>; + +defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v4i64x_info, v8i64_info, + v16i32_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasDQI]>; +defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v16i16x_info, 
v32i16_info, + v16i32_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasDQI]>; +defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v32i8x_info, v64i8_info, + v16i32_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasDQI]>; +defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v8i32x_info, v16i32_info, + v8i64_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasAVX512]>; +defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v16i16x_info, v32i16_info, + v8i64_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasAVX512]>; +defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v32i8x_info, v64i8_info, + v8i64_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasAVX512]>; + +// vinsertps - insert f32 to XMM +let ExeDomain = SSEPackedSingle in { +def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), + "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, + EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>; +def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), + (ins VR128X:$src1, f32mem:$src2, u8imm:$src3), + "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR128X:$dst, (X86insertps VR128X:$src1, + (v4f32 (scalar_to_vector (loadf32 addr:$src2))), + imm:$src3))]>, + EVEX_4V, EVEX_CD8<32, CD8VT1>, + Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 VECTOR EXTRACT +//--- + +// Supports two different pattern operators for mask and unmasked ops. Allows +// null_frag to be passed for one. +multiclass vextract_for_size_split<int Opcode, + X86VectorVTInfo From, X86VectorVTInfo To, + SDPatternOperator vextract_extract, + SDPatternOperator vextract_for_mask, + SchedWrite SchedRR, SchedWrite SchedMR> { + + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + defm rr : AVX512_maskable_split<Opcode, MRMDestReg, To, (outs To.RC:$dst), + (ins From.RC:$src1, u8imm:$idx), + "vextract" # To.EltTypeName # "x" # To.NumElts, + "$idx, $src1", "$src1, $idx", + (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)), + (vextract_for_mask:$idx (From.VT From.RC:$src1), (iPTR imm))>, + AVX512AIi8Base, EVEX, Sched<[SchedRR]>; + + def mr : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$idx, $src1, $dst|$dst, $src1, $idx}", + [(store (To.VT (vextract_extract:$idx + (From.VT From.RC:$src1), (iPTR imm))), + addr:$dst)]>, EVEX, + Sched<[SchedMR]>; + + let mayStore = 1, hasSideEffects = 0 in + def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, To.KRCWM:$mask, + From.RC:$src1, u8imm:$idx), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$idx, $src1, $dst {${mask}}|" + "$dst {${mask}}, $src1, $idx}", []>, + EVEX_K, EVEX, Sched<[SchedMR]>, NotMemoryFoldable; + } +} + +// Passes the same pattern operator for masked and unmasked ops. 
+multiclass vextract_for_size<int Opcode, X86VectorVTInfo From, + X86VectorVTInfo To, + SDPatternOperator vextract_extract, + SchedWrite SchedRR, SchedWrite SchedMR> : + vextract_for_size_split<Opcode, From, To, vextract_extract, vextract_extract, SchedRR, SchedMR>; + +// Codegen pattern for the alternative types +multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vextract_extract, + SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> { + let Predicates = p in { + def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + From.RC:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; + def : Pat<(store (To.VT (vextract_extract:$ext (From.VT From.RC:$src1), + (iPTR imm))), addr:$dst), + (!cast<Instruction>(InstrStr#"mr") addr:$dst, From.RC:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext))>; + } +} + +multiclass vextract_for_type<ValueType EltVT32, int Opcode128, + ValueType EltVT64, int Opcode256, + SchedWrite SchedRR, SchedWrite SchedMR> { + let Predicates = [HasAVX512] in { + defm NAME # "32x4Z" : vextract_for_size<Opcode128, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract, SchedRR, SchedMR>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; + defm NAME # "64x4Z" : vextract_for_size<Opcode256, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vextract256_extract, SchedRR, SchedMR>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; + } + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract, SchedRR, SchedMR>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; + + // Even with DQI we'd like to only use these instructions for masking. + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vextract_for_size_split<Opcode128, + X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + null_frag, vextract128_extract, SchedRR, SchedMR>, + VEX_W1X, EVEX_V256, EVEX_CD8<64, CD8VT2>; + + // Even with DQI we'd like to only use these instructions for masking. + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : vextract_for_size_split<Opcode128, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + null_frag, vextract128_extract, SchedRR, SchedMR>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; + defm NAME # "32x8Z" : vextract_for_size_split<Opcode256, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 8, EltVT32, VR256X>, + null_frag, vextract256_extract, SchedRR, SchedMR>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; + } +} + +// TODO - replace WriteFStore/WriteVecStore with X86SchedWriteMoveLSWidths types. +defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b, WriteFShuffle256, WriteFStore>; +defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b, WriteShuffle256, WriteVecStore>; + +// extract_subvector codegen patterns with the alternative types. +// Even with AVX512DQ we'll still use these for unmasked operations. 
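+// An unmasked sub-vector extract moves the same bits regardless of element
+// type, so e.g. an unmasked v2f64 extract from v8f64 can reuse
+// VEXTRACTF32x4Zrr below rather than requiring the DQI-only 64x2 form.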
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; + +defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; + +defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; + +// Codegen pattern with the alternative types extract VEC128 from VEC256 +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; + +// Codegen pattern with the alternative types extract VEC128 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +// Codegen pattern with the alternative types extract VEC256 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; + + +// A 128-bit extract from bits [255:128] of a 512-bit vector should use a +// smaller extract to enable EVEX->VEX. +let Predicates = [NoVLX] in { +def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))), + (v2i64 (VEXTRACTI128rr + (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 2))), + (v2f64 (VEXTRACTF128rr + (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 4))), + (v4i32 (VEXTRACTI128rr + (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 4))), + (v4f32 (VEXTRACTF128rr + (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))), + (v8i16 (VEXTRACTI128rr + (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))), + (v16i8 (VEXTRACTI128rr + (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)), + (iPTR 1)))>; +} + +// A 128-bit extract from bits [255:128] of a 512-bit vector should use a +// smaller extract to enable EVEX->VEX. 
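+// E.g. rather than keeping "vextracti32x4 $1, %zmm0, %xmm0" (which has no VEX
+// equivalent), extract via the low ymm so the EVEX->VEX pass can shrink the
+// result to "vextracti128 $1, %ymm0, %xmm0" when no masking is involved.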
+let Predicates = [HasVLX] in { +def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))), + (v2i64 (VEXTRACTI32x4Z256rr + (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 2))), + (v2f64 (VEXTRACTF32x4Z256rr + (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 4))), + (v4i32 (VEXTRACTI32x4Z256rr + (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 4))), + (v4f32 (VEXTRACTF32x4Z256rr + (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))), + (v8i16 (VEXTRACTI32x4Z256rr + (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))), + (v16i8 (VEXTRACTI32x4Z256rr + (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)), + (iPTR 1)))>; +} + + +// Additional patterns for handling a bitcast between the vselect and the +// extract_subvector. +multiclass vextract_for_mask_cast<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, X86VectorVTInfo Cast, + PatFrag vextract_extract, + SDNodeXForm EXTRACT_get_vextract_imm, + list<Predicate> p> { +let Predicates = p in { + def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, + (bitconvert + (To.VT (vextract_extract:$ext + (From.VT From.RC:$src), (iPTR imm)))), + To.RC:$src0)), + (Cast.VT (!cast<Instruction>(InstrStr#"rrk") + Cast.RC:$src0, Cast.KRCWM:$mask, From.RC:$src, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; + + def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, + (bitconvert + (To.VT (vextract_extract:$ext + (From.VT From.RC:$src), (iPTR imm)))), + Cast.ImmAllZerosV)), + (Cast.VT (!cast<Instruction>(InstrStr#"rrkz") + Cast.KRCWM:$mask, From.RC:$src, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; +} +} + +defm : vextract_for_mask_cast<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info, + v4f32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasVLX]>; +defm : vextract_for_mask_cast<"VEXTRACTF64x2Z256", v8f32x_info, v4f32x_info, + v2f64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>; + +defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info, + v4i32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasVLX]>; +defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info, + v4i32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasVLX]>; +defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info, + v4i32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasVLX]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v8i32x_info, v4i32x_info, + v2i64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v16i16x_info, v8i16x_info, + v2i64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v32i8x_info, v16i8x_info, + v2i64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>; + +defm : vextract_for_mask_cast<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info, + v4f32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_mask_cast<"VEXTRACTF64x2Z", v16f32_info, v4f32x_info, + v2f64x_info, vextract128_extract, + 
EXTRACT_get_vextract128_imm, [HasDQI]>; + +defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info, + v4i32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, + v4i32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, + v4i32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v16i32_info, v4i32x_info, + v2i64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v32i16_info, v8i16x_info, + v2i64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v64i8_info, v16i8x_info, + v2i64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI]>; + +defm : vextract_for_mask_cast<"VEXTRACTF32x8Z", v8f64_info, v4f64x_info, + v8f32x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasDQI]>; +defm : vextract_for_mask_cast<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info, + v4f64x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasAVX512]>; + +defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v8i64_info, v4i64x_info, + v8i32x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasDQI]>; +defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v32i16_info, v16i16x_info, + v8i32x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasDQI]>; +defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v64i8_info, v32i8x_info, + v8i32x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasDQI]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info, + v4i64x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, + v4i64x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, + v4i64x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasAVX512]>; + +// vextractps - extract 32 bits from XMM +def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), + (ins VR128X:$src1, u8imm:$src2), + "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, + EVEX, VEX_WIG, Sched<[WriteVecExtract]>; + +def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs), + (ins f32mem:$dst, VR128X:$src1, u8imm:$src2), + "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), + addr:$dst)]>, + EVEX, VEX_WIG, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecExtractSt]>; + +//===---------------------------------------------------------------------===// +// AVX-512 BROADCAST +//--- +// broadcast with a scalar argument. 
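+// E.g. for VBROADCASTSS this maps (v16f32 (X86VBroadcast FR32X:$src)) to
+// (VBROADCASTSSZr (COPY_TO_REGCLASS FR32X:$src, VR128X)), i.e. the scalar FP
+// register is first placed in the matching XMM class so the register form of
+// the broadcast can be reused; the masked and zero-masked variants follow suit.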
+multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr, + string Name, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { + def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)), + (!cast<Instruction>(Name#DestInfo.ZSuffix#r) + (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>; + def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, + (X86VBroadcast SrcInfo.FRC:$src), + DestInfo.RC:$src0)), + (!cast<Instruction>(Name#DestInfo.ZSuffix#rk) + DestInfo.RC:$src0, DestInfo.KRCWM:$mask, + (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>; + def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, + (X86VBroadcast SrcInfo.FRC:$src), + DestInfo.ImmAllZerosV)), + (!cast<Instruction>(Name#DestInfo.ZSuffix#rkz) + DestInfo.KRCWM:$mask, (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>; +} + +// Split version to allow mask and broadcast node to be different types. This +// helps support the 32x2 broadcasts. +multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr, + string Name, + SchedWrite SchedRR, SchedWrite SchedRM, + X86VectorVTInfo MaskInfo, + X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo, + SDPatternOperator UnmaskedOp = X86VBroadcast> { + let ExeDomain = DestInfo.ExeDomain, hasSideEffects = 0 in { + defm r : AVX512_maskable_split<opc, MRMSrcReg, MaskInfo, + (outs MaskInfo.RC:$dst), + (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src", + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))), + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))))>, + T8PD, EVEX, Sched<[SchedRR]>; + let mayLoad = 1 in + defm m : AVX512_maskable_split<opc, MRMSrcMem, MaskInfo, + (outs MaskInfo.RC:$dst), + (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (MaskInfo.VT + (bitconvert + (DestInfo.VT (UnmaskedOp + (SrcInfo.ScalarLdFrag addr:$src))))), + (MaskInfo.VT + (bitconvert + (DestInfo.VT (X86VBroadcast + (SrcInfo.ScalarLdFrag addr:$src)))))>, + T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>, + Sched<[SchedRM]>; + } + + def : Pat<(MaskInfo.VT + (bitconvert + (DestInfo.VT (UnmaskedOp + (SrcInfo.VT (scalar_to_vector + (SrcInfo.ScalarLdFrag addr:$src))))))), + (!cast<Instruction>(Name#MaskInfo.ZSuffix#m) addr:$src)>; + def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask, + (bitconvert + (DestInfo.VT + (X86VBroadcast + (SrcInfo.VT (scalar_to_vector + (SrcInfo.ScalarLdFrag addr:$src)))))), + MaskInfo.RC:$src0)), + (!cast<Instruction>(Name#DestInfo.ZSuffix#mk) + MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>; + def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask, + (bitconvert + (DestInfo.VT + (X86VBroadcast + (SrcInfo.VT (scalar_to_vector + (SrcInfo.ScalarLdFrag addr:$src)))))), + MaskInfo.ImmAllZerosV)), + (!cast<Instruction>(Name#MaskInfo.ZSuffix#mkz) + MaskInfo.KRCWM:$mask, addr:$src)>; +} + +// Helper class to force mask and broadcast result to same type. 
+multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, string Name, + SchedWrite SchedRR, SchedWrite SchedRM, + X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo> : + avx512_broadcast_rm_split<opc, OpcodeStr, Name, SchedRR, SchedRM, + DestInfo, DestInfo, SrcInfo>; + +multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, + WriteFShuffle256Ld, _.info512, _.info128>, + avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512, + _.info128>, + EVEX_V512; + } + + let Predicates = [HasVLX] in { + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, + WriteFShuffle256Ld, _.info256, _.info128>, + avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256, + _.info128>, + EVEX_V256; + } +} + +multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, + WriteFShuffle256Ld, _.info512, _.info128>, + avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info512, + _.info128>, + EVEX_V512; + } + + let Predicates = [HasVLX] in { + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, + WriteFShuffle256Ld, _.info256, _.info128>, + avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info256, + _.info128>, + EVEX_V256; + defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteFShuffle256, + WriteFShuffle256Ld, _.info128, _.info128>, + avx512_broadcast_scalar<opc, OpcodeStr, NAME, _.info128, + _.info128>, + EVEX_V128; + } +} +defm VBROADCASTSS : avx512_fp_broadcast_ss<0x18, "vbroadcastss", + avx512vl_f32_info>; +defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd", + avx512vl_f64_info>, VEX_W1X; + +multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR, + X86VectorVTInfo _, SDPatternOperator OpNode, + RegisterClass SrcRC> { + let ExeDomain = _.ExeDomain in + defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins SrcRC:$src), + "vpbroadcast"##_.Suffix, "$src", "$src", + (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX, + Sched<[SchedRR]>; +} + +multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite SchedRR, + X86VectorVTInfo _, SDPatternOperator OpNode, + RegisterClass SrcRC, SubRegIndex Subreg> { + let hasSideEffects = 0, ExeDomain = _.ExeDomain in + defm r : AVX512_maskable_custom<opc, MRMSrcReg, + (outs _.RC:$dst), (ins GR32:$src), + !con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)), + !con((ins _.KRCWM:$mask), (ins GR32:$src)), + "vpbroadcast"##_.Suffix, "$src", "$src", [], [], [], + "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>; + + def : Pat <(_.VT (OpNode SrcRC:$src)), + (!cast<Instruction>(Name#r) + (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>; + + def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.RC:$src0), + (!cast<Instruction>(Name#rk) _.RC:$src0, _.KRCWM:$mask, + (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>; + + def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.ImmAllZerosV), + (!cast<Instruction>(Name#rkz) _.KRCWM:$mask, + (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>; +} + +multiclass avx512_int_broadcastbw_reg_vl<bits<8> opc, string Name, + AVX512VLVectorVTInfo _, SDPatternOperator OpNode, + RegisterClass SrcRC, SubRegIndex Subreg, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_int_broadcastbw_reg<opc, Name#Z, 
WriteShuffle256, _.info512, + OpNode, SrcRC, Subreg>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_int_broadcastbw_reg<opc, Name#Z256, WriteShuffle256, + _.info256, OpNode, SrcRC, Subreg>, EVEX_V256; + defm Z128 : avx512_int_broadcastbw_reg<opc, Name#Z128, WriteShuffle, + _.info128, OpNode, SrcRC, Subreg>, EVEX_V128; + } +} + +multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _, + SDPatternOperator OpNode, + RegisterClass SrcRC, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_int_broadcast_reg<opc, WriteShuffle256, _.info512, OpNode, + SrcRC>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_int_broadcast_reg<opc, WriteShuffle256, _.info256, OpNode, + SrcRC>, EVEX_V256; + defm Z128 : avx512_int_broadcast_reg<opc, WriteShuffle, _.info128, OpNode, + SrcRC>, EVEX_V128; + } +} + +defm VPBROADCASTBr : avx512_int_broadcastbw_reg_vl<0x7A, "VPBROADCASTBr", + avx512vl_i8_info, X86VBroadcast, GR8, sub_8bit, HasBWI>; +defm VPBROADCASTWr : avx512_int_broadcastbw_reg_vl<0x7B, "VPBROADCASTWr", + avx512vl_i16_info, X86VBroadcast, GR16, sub_16bit, + HasBWI>; +defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, + X86VBroadcast, GR32, HasAVX512>; +defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, + X86VBroadcast, GR64, HasAVX512>, VEX_W; + +// Provide aliases for broadcast from the same register class that +// automatically does the extract. +multiclass avx512_int_broadcast_rm_lowering<string Name, + X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo, + X86VectorVTInfo ExtInfo> { + def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))), + (!cast<Instruction>(Name#DestInfo.ZSuffix#"r") + (ExtInfo.VT (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm)))>; +} + +multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256, + WriteShuffle256Ld, _.info512, _.info128>, + avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info256, _.info128>, + EVEX_V512; + // Defined separately to avoid redefinition. + defm Z_Alt : avx512_int_broadcast_rm_lowering<NAME, _.info512, _.info512, _.info128>; + } + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle256, + WriteShuffle256Ld, _.info256, _.info128>, + avx512_int_broadcast_rm_lowering<NAME, _.info256, _.info256, _.info128>, + EVEX_V256; + defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, NAME, WriteShuffle, + WriteShuffleXLd, _.info128, _.info128>, + EVEX_V128; + } +} + +defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb", + avx512vl_i8_info, HasBWI>; +defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw", + avx512vl_i16_info, HasBWI>; +defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd", + avx512vl_i32_info, HasAVX512>; +defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", + avx512vl_i64_info, HasAVX512>, VEX_W1X; + +multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (X86SubVBroadcast + (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, + Sched<[SchedWriteShuffle.YMM.Folded]>, + AVX5128IBase, EVEX; +} + +// This should be used for the AVX512DQ broadcast instructions. 
It disables +// the unmasked patterns so that we only use the DQ instructions when masking +// is requested. +multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { + let hasSideEffects = 0, mayLoad = 1 in + defm rm : AVX512_maskable_split<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", + (null_frag), + (_Dst.VT (X86SubVBroadcast + (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, + Sched<[SchedWriteShuffle.YMM.Folded]>, + AVX5128IBase, EVEX; +} + +let Predicates = [HasAVX512] in { + // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. + def : Pat<(v8i64 (X86VBroadcast (v8i64 (X86vzload addr:$src)))), + (VPBROADCASTQZm addr:$src)>; +} + +let Predicates = [HasVLX] in { + // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. + def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))), + (VPBROADCASTQZ128m addr:$src)>; + def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))), + (VPBROADCASTQZ256m addr:$src)>; +} +let Predicates = [HasVLX, HasBWI] in { + // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. + // This means we'll encounter truncated i32 loads; match that here. + def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWZ128m addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWZ256m addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWZ128m addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWZ256m addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 BROADCAST SUBVECTORS +// + +defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", + v16i32_info, v4i32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTF32X4 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", + v16f32_info, v4f32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTI64X4 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti64x4", + v8i64_info, v4i64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT4>; +defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4", + v8f64_info, v4f64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT4>; + +let Predicates = [HasAVX512] in { +def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))), + (VBROADCASTF64X4rm addr:$src)>; +def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))), + (VBROADCASTI64X4rm addr:$src)>; +def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))), + (VBROADCASTI64X4rm addr:$src)>; +def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))), + (VBROADCASTI64X4rm addr:$src)>; + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. 
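+// The register fallback builds the 512-bit broadcast with a single insert:
+// INSERT_SUBREG places the ymm source in the low half of an undef zmm and
+// VINSERT*64x4Zrr with immediate 1 duplicates it into the upper half.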
+def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))), + (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v4f64 VR256X:$src), 1)>; +def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))), + (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v8f32 VR256X:$src), 1)>; +def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))), + (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v4i64 VR256X:$src), 1)>; +def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))), + (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v8i32 VR256X:$src), 1)>; +def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))), + (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v16i16 VR256X:$src), 1)>; +def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))), + (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v32i8 VR256X:$src), 1)>; + +def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))), + (VBROADCASTF32X4rm addr:$src)>; +def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))), + (VBROADCASTI32X4rm addr:$src)>; +def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), + (VBROADCASTI32X4rm addr:$src)>; +def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), + (VBROADCASTI32X4rm addr:$src)>; + +// Patterns for selects of bitcasted operations. +def : Pat<(vselect VK16WM:$mask, + (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), + (bc_v16f32 (v16i32 immAllZerosV))), + (VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>; +def : Pat<(vselect VK16WM:$mask, + (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), + VR512:$src0), + (VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>; +def : Pat<(vselect VK16WM:$mask, + (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), + (v16i32 immAllZerosV)), + (VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>; +def : Pat<(vselect VK16WM:$mask, + (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), + VR512:$src0), + (VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>; + +def : Pat<(vselect VK8WM:$mask, + (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))), + (bc_v8f64 (v16i32 immAllZerosV))), + (VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>; +def : Pat<(vselect VK8WM:$mask, + (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))), + VR512:$src0), + (VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; +def : Pat<(vselect VK8WM:$mask, + (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))), + (bc_v8i64 (v16i32 immAllZerosV))), + (VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>; +def : Pat<(vselect VK8WM:$mask, + (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src))))), + VR512:$src0), + (VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>; +} + +let Predicates = [HasVLX] in { +defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", + v8i32x_info, v4i32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; +defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", + v8f32x_info, v4f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; + +def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), + (VBROADCASTF32X4Z256rm addr:$src)>; +def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), + (VBROADCASTI32X4Z256rm addr:$src)>; +def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), + (VBROADCASTI32X4Z256rm 
addr:$src)>; +def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), + (VBROADCASTI32X4Z256rm addr:$src)>; + +// Patterns for selects of bitcasted operations. +def : Pat<(vselect VK8WM:$mask, + (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), + (bc_v8f32 (v8i32 immAllZerosV))), + (VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>; +def : Pat<(vselect VK8WM:$mask, + (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))), + VR256X:$src0), + (VBROADCASTF32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>; +def : Pat<(vselect VK8WM:$mask, + (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), + (v8i32 immAllZerosV)), + (VBROADCASTI32X4Z256rmkz VK8WM:$mask, addr:$src)>; +def : Pat<(vselect VK8WM:$mask, + (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))), + VR256X:$src0), + (VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>; + + +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. +def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))), + (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v2f64 VR128X:$src), 1)>; +def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))), + (VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v4f32 VR128X:$src), 1)>; +def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))), + (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v2i64 VR128X:$src), 1)>; +def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))), + (VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v4i32 VR128X:$src), 1)>; +def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128X:$src))), + (VINSERTI32x4Z256rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v8i16 VR128X:$src), 1)>; +def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))), + (VINSERTI32x4Z256rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v16i8 VR128X:$src), 1)>; +} + +let Predicates = [HasVLX, HasDQI] in { +defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", + v4i64x_info, v2i64x_info>, VEX_W1X, + EVEX_V256, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", + v4f64x_info, v2f64x_info>, VEX_W1X, + EVEX_V256, EVEX_CD8<64, CD8VT2>; + +// Patterns for selects of bitcasted operations. 
+def : Pat<(vselect VK4WM:$mask, + (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), + (bc_v4f64 (v8i32 immAllZerosV))), + (VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>; +def : Pat<(vselect VK4WM:$mask, + (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), + VR256X:$src0), + (VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; +def : Pat<(vselect VK4WM:$mask, + (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))), + (bc_v4i64 (v8i32 immAllZerosV))), + (VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>; +def : Pat<(vselect VK4WM:$mask, + (bc_v4i64 (v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))), + VR256X:$src0), + (VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>; +} + +let Predicates = [HasDQI] in { +defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", + v8i64_info, v2i64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm_dq<0x5b, "vbroadcasti32x8", + v16i32_info, v8i32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; +defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", + v8f64_info, v2f64x_info>, VEX_W, + EVEX_V512, EVEX_CD8<64, CD8VT2>; +defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8", + v16f32_info, v8f32x_info>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; + +// Patterns for selects of bitcasted operations. +def : Pat<(vselect VK16WM:$mask, + (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))), + (bc_v16f32 (v16i32 immAllZerosV))), + (VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>; +def : Pat<(vselect VK16WM:$mask, + (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))), + VR512:$src0), + (VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>; +def : Pat<(vselect VK16WM:$mask, + (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))), + (v16i32 immAllZerosV)), + (VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>; +def : Pat<(vselect VK16WM:$mask, + (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))), + VR512:$src0), + (VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>; + +def : Pat<(vselect VK8WM:$mask, + (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), + (bc_v8f64 (v16i32 immAllZerosV))), + (VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>; +def : Pat<(vselect VK8WM:$mask, + (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))), + VR512:$src0), + (VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; +def : Pat<(vselect VK8WM:$mask, + (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))), + (bc_v8i64 (v16i32 immAllZerosV))), + (VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>; +def : Pat<(vselect VK8WM:$mask, + (bc_v8i64 (v16i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src))))), + VR512:$src0), + (VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>; +} + +multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> { + let Predicates = [HasDQI] in + defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256, + WriteShuffle256Ld, _Dst.info512, + _Src.info512, _Src.info128, null_frag>, + EVEX_V512; + let Predicates = [HasDQI, HasVLX] in + defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle256, + WriteShuffle256Ld, _Dst.info256, + _Src.info256, _Src.info128, null_frag>, + EVEX_V256; +} + +multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> : + 
avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> { + + let Predicates = [HasDQI, HasVLX] in + defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, NAME, WriteShuffle, + WriteShuffleXLd, _Dst.info128, + _Src.info128, _Src.info128, null_frag>, + EVEX_V128; +} + +defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", + avx512vl_i32_info, avx512vl_i64_info>; +defm VBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", + avx512vl_f32_info, avx512vl_f64_info>; + +let Predicates = [HasVLX] in { +def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256X:$src))), + (VBROADCASTSSZ256r (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>; +def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256X:$src))), + (VBROADCASTSDZ256r (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>; +} + +def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), + (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))>; +def : Pat<(v16f32 (X86VBroadcast (v8f32 VR256X:$src))), + (VBROADCASTSSZr (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))>; + +def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), + (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))>; +def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), + (VBROADCASTSDZr (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))>; + +//===----------------------------------------------------------------------===// +// AVX-512 BROADCAST MASK TO VECTOR REGISTER +//--- +multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass KRC> { + def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, + EVEX, Sched<[WriteShuffle]>; +} + +multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> { + let Predicates = [HasCDI] in + defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512; + let Predicates = [HasCDI, HasVLX] in { + defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256; + defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128; + } +} + +defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", + avx512vl_i32_info, VK16>; +defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", + avx512vl_i64_info, VK8>, VEX_W; + +//===----------------------------------------------------------------------===// +// -- VPERMI2 - 3 source operands form -- +multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { +let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, + hasSideEffects = 0 in { + defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1, _.RC:$src3)), 1>, + EVEX_4V, AVX5128IBase, Sched<[sched]>; + + let mayLoad = 1 in + defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1, + (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>, + EVEX_4V, AVX5128IBase, Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, + X86VectorVTInfo _, X86VectorVTInfo 
IdxVT> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, + hasSideEffects = 0, mayLoad = 1 in + defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (_.VT (X86VPermt2 _.RC:$src2, + IdxVT.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>, + AVX5128IBase, EVEX_4V, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_i_mb<opc, OpcodeStr, sched, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; + } +} + +multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx, + Predicate Prd> { + let Predicates = [Prd] in + defm NAME: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info512, + Idx.info512>, EVEX_V512; + let Predicates = [Prd, HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, sched, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", WriteVarShuffle256, + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", WriteVarShuffle256, + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w", WriteVarShuffle256, + avx512vl_i16_info, avx512vl_i16_info, HasBWI>, + VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b", WriteVarShuffle256, + avx512vl_i8_info, avx512vl_i8_info, HasVBMI>, + EVEX_CD8<8, CD8VF>; +defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", WriteFVarShuffle256, + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", WriteFVarShuffle256, + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + +// Extra patterns to deal with extra bitcasts due to passthru and index being +// different types on the fp versions. 
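+// For the FP variants the tied operand $src1 is both the index vector and the
+// passthru, so when it was built as vXi64 (common due to the ABI) a bitconvert
+// appears between the vselect and X86VPermt2; these patterns look through it
+// so the masked rrk/rmk/rmbk forms are still selected.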
+multiclass avx512_perm_i_lowering<string InstrStr, X86VectorVTInfo _, + X86VectorVTInfo IdxVT, + X86VectorVTInfo CastVT> { + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86VPermt2 (_.VT _.RC:$src2), + (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), _.RC:$src3), + (_.VT (bitconvert (CastVT.VT _.RC:$src1))))), + (!cast<Instruction>(InstrStr#"rrk") _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, _.RC:$src3)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86VPermt2 _.RC:$src2, + (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), + (_.LdFrag addr:$src3)), + (_.VT (bitconvert (CastVT.VT _.RC:$src1))))), + (!cast<Instruction>(InstrStr#"rmk") _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86VPermt2 _.RC:$src2, + (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), + (X86VBroadcast (_.ScalarLdFrag addr:$src3))), + (_.VT (bitconvert (CastVT.VT _.RC:$src1))))), + (!cast<Instruction>(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3)>; +} + +// TODO: Should we add more casts? The vXi64 case is common due to ABI. +defm : avx512_perm_i_lowering<"VPERMI2PS", v16f32_info, v16i32_info, v8i64_info>; +defm : avx512_perm_i_lowering<"VPERMI2PS256", v8f32x_info, v8i32x_info, v4i64x_info>; +defm : avx512_perm_i_lowering<"VPERMI2PS128", v4f32x_info, v4i32x_info, v2i64x_info>; + +// VPERMT2 +multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { +let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { + defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins IdxVT.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>, + EVEX_4V, AVX5128IBase, Sched<[sched]>; + + defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins IdxVT.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, + (bitconvert (_.LdFrag addr:$src3)))), 1>, + EVEX_4V, AVX5128IBase, Sched<[sched.Folded, ReadAfterLd]>; + } +} +multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in + defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (_.VT (X86VPermt2 _.RC:$src1, + IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), 1>, + AVX5128IBase, EVEX_4V, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_t_mb<opc, OpcodeStr, sched, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#128: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_t_mb<opc, OpcodeStr, sched, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_t_mb<opc, OpcodeStr, sched, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; + } +} + +multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr, + 
X86FoldableSchedWrite sched, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx, Predicate Prd> { + let Predicates = [Prd] in + defm NAME: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info512, + Idx.info512>, EVEX_V512; + let Predicates = [Prd, HasVLX] in { + defm NAME#128: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, sched, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", WriteVarShuffle256, + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", WriteVarShuffle256, + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w", WriteVarShuffle256, + avx512vl_i16_info, avx512vl_i16_info, HasBWI>, + VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b", WriteVarShuffle256, + avx512vl_i8_info, avx512vl_i8_info, HasVBMI>, + EVEX_CD8<8, CD8VF>; +defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", WriteFVarShuffle256, + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", WriteFVarShuffle256, + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX-512 - BLEND using mask +// + +multiclass WriteFVarBlendask<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>, + EVEX_4V, Sched<[sched]>; + def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), + []>, EVEX_4V, EVEX_K, Sched<[sched]>; + def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), + []>, EVEX_4V, EVEX_KZ, Sched<[sched]>, NotMemoryFoldable; + let mayLoad = 1 in { + def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), + []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; + def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), + []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; + def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), + []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable; + } + } +} +multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let mayLoad = 1, hasSideEffects = 0 in { + def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", 
$src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), []>, + EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; + + def rmbkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}} {z}|", + "$dst {${mask}} {z}, $src1, ${src2}", _.BroadcastStr, "}"), []>, + EVEX_4V, EVEX_KZ, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable; + + def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, "}"), []>, + EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass blendmask_dq<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo VTInfo> { + defm Z : WriteFVarBlendask<opc, OpcodeStr, sched.ZMM, VTInfo.info512>, + WriteFVarBlendask_rmb<opc, OpcodeStr, sched.ZMM, VTInfo.info512>, + EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z256 : WriteFVarBlendask<opc, OpcodeStr, sched.YMM, VTInfo.info256>, + WriteFVarBlendask_rmb<opc, OpcodeStr, sched.YMM, VTInfo.info256>, + EVEX_V256; + defm Z128 : WriteFVarBlendask<opc, OpcodeStr, sched.XMM, VTInfo.info128>, + WriteFVarBlendask_rmb<opc, OpcodeStr, sched.XMM, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass blendmask_bw<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasBWI] in + defm Z : WriteFVarBlendask<opc, OpcodeStr, sched.ZMM, VTInfo.info512>, + EVEX_V512; + + let Predicates = [HasBWI, HasVLX] in { + defm Z256 : WriteFVarBlendask<opc, OpcodeStr, sched.YMM, VTInfo.info256>, + EVEX_V256; + defm Z128 : WriteFVarBlendask<opc, OpcodeStr, sched.XMM, VTInfo.info128>, + EVEX_V128; + } +} + +defm VBLENDMPS : blendmask_dq<0x65, "vblendmps", SchedWriteFVarBlend, + avx512vl_f32_info>; +defm VBLENDMPD : blendmask_dq<0x65, "vblendmpd", SchedWriteFVarBlend, + avx512vl_f64_info>, VEX_W; +defm VPBLENDMD : blendmask_dq<0x64, "vpblendmd", SchedWriteVarBlend, + avx512vl_i32_info>; +defm VPBLENDMQ : blendmask_dq<0x64, "vpblendmq", SchedWriteVarBlend, + avx512vl_i64_info>, VEX_W; +defm VPBLENDMB : blendmask_bw<0x66, "vpblendmb", SchedWriteVarBlend, + avx512vl_i8_info>; +defm VPBLENDMW : blendmask_bw<0x66, "vpblendmw", SchedWriteVarBlend, + avx512vl_i16_info>, VEX_W; + +//===----------------------------------------------------------------------===// +// Compare Instructions +//===----------------------------------------------------------------------===// + +// avx512_cmp_scalar - AVX512 CMPSS and CMPSD + +multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd, + X86FoldableSchedWrite sched> { + defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)>, EVEX_4V, Sched<[sched]>; + let mayLoad = 1 in + defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.IntScalarMemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, + imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[sched.Folded, ReadAfterLd]>; + + defm rrb_Int : 
AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))>, + EVEX_4V, EVEX_B, Sched<[sched]>; + // Accept explicit immediate argument form instead of comparison code. + let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs VK1:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V, + Sched<[sched]>, NotMemoryFoldable; + let mayLoad = 1 in + defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable; + + defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">, + EVEX_4V, EVEX_B, Sched<[sched]>, NotMemoryFoldable; + }// let isAsmParserOnly = 1, hasSideEffects = 0 + + let isCodeGenOnly = 1 in { + let isCommutable = 1 in + def rr : AVX512Ii8<0xC2, MRMSrcReg, + (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + _.FRC:$src2, + imm:$cc))]>, + EVEX_4V, Sched<[sched]>; + def rm : AVX512Ii8<0xC2, MRMSrcMem, + (outs _.KRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2), + imm:$cc))]>, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +let Predicates = [HasAVX512] in { + let ExeDomain = SSEPackedSingle in + defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd, + SchedWriteFCmp.Scl>, AVX512XSIi8Base; + let ExeDomain = SSEPackedDouble in + defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd, + SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W; +} + +multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, PatFrag OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _, + bit IsCommutable> { + let isCommutable = IsCommutable in + def rr : AVX512BI<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))]>, + EVEX_4V, Sched<[sched]>; + def rm : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>, + EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>; + let isCommutable = IsCommutable in + def rrk : AVX512BI<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>, + EVEX_4V, EVEX_K, Sched<[sched]>; + def rmk : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), + 
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))))))]>, + EVEX_4V, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, PatFrag OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _, + bit IsCommutable> : + avx512_icmp_packed<opc, OpcodeStr, OpNode, sched, _, IsCommutable> { + def rmb : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", + "|$dst, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (X86VBroadcast (_.ScalarLdFrag addr:$src2))))]>, + EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + def rmbk : AVX512BI<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2), + !strconcat(OpcodeStr, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))]>, + EVEX_4V, EVEX_K, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, PatFrag OpNode, + X86SchedWriteWidths sched, + AVX512VLVectorVTInfo VTInfo, Predicate prd, + bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.ZMM, + VTInfo.info512, IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.YMM, + VTInfo.info256, IsCommutable>, EVEX_V256; + defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, sched.XMM, + VTInfo.info128, IsCommutable>, EVEX_V128; + } +} + +multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, + PatFrag OpNode, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo VTInfo, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.ZMM, + VTInfo.info512, IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.YMM, + VTInfo.info256, IsCommutable>, EVEX_V256; + defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, sched.XMM, + VTInfo.info128, IsCommutable>, EVEX_V128; + } +} + +// This fragment treats X86cmpm as commutable to help match loads in both +// operands for PCMPEQ. +def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>; +def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2), + (X86setcc_commute node:$src1, node:$src2, SETEQ)>; +def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2), + (setcc node:$src1, node:$src2, SETGT)>; + +// AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't +// increase the pattern complexity the way an immediate would. +let AddedComplexity = 2 in { +// FIXME: Is there a better scheduler class for VPCMP? 
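+// For illustration, the equality defs below use the commutable fragment, so a
+// DAG of roughly the form
+//   (v16i1 (setcc (v16i32 VR512:$a), (v16i32 (load addr:$b)), SETEQ))
+// can fold the load from either operand position into the "rm" forms, while
+// the non-commutable signed-greater-than fragment only folds a load that
+// appears as the second operand.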
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, + SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>, + EVEX_CD8<8, CD8VF>, VEX_WIG; + +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, + SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>, + EVEX_CD8<16, CD8VF>, VEX_WIG; + +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, + SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, + SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, + SchedWriteVecALU, avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>, VEX_WIG; + +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, + SchedWriteVecALU, avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>, VEX_WIG; + +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, + SchedWriteVecALU, avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, + SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag, + PatFrag CommFrag, X86FoldableSchedWrite sched, + X86VectorVTInfo _, string Name> { + let isCommutable = 1 in + def rri : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + cond)))]>, + EVEX_4V, Sched<[sched]>; + def rmi : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (_.KVT + (Frag:$cc + (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + cond)))]>, + EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>; + let isCommutable = 1 in + def rrik : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, + AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (_.KVT (Frag:$cc (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + cond))))]>, + EVEX_4V, EVEX_K, Sched<[sched]>; + def rmik : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, + AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (_.KVT + (Frag:$cc + (_.VT _.RC:$src1), + (_.VT (bitconvert + (_.LdFrag addr:$src2))), + cond))))]>, + EVEX_4V, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>; + + // Accept explicit immediate argument form instead of comparison code. 
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rri_alt : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", + "$dst, $src1, $src2, $cc}"), []>, + EVEX_4V, Sched<[sched]>, NotMemoryFoldable; + let mayLoad = 1 in + def rmi_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", + "$dst, $src1, $src2, $cc}"), []>, + EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable; + def rrik_alt : AVX512AIi8<opc, MRMSrcReg, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, + u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2, $cc}"), []>, + EVEX_4V, EVEX_K, Sched<[sched]>, NotMemoryFoldable; + let mayLoad = 1 in + def rmik_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, + u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, $src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2, $cc}"), []>, + EVEX_4V, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>, + NotMemoryFoldable; + } + + def : Pat<(_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)), + (_.VT _.RC:$src1), cond)), + (!cast<Instruction>(Name#_.ZSuffix#"rmi") + _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>; + + def : Pat<(and _.KRCWM:$mask, + (_.KVT (CommFrag:$cc (bitconvert (_.LdFrag addr:$src2)), + (_.VT _.RC:$src1), cond))), + (!cast<Instruction>(Name#_.ZSuffix#"rmik") + _.KRCWM:$mask, _.RC:$src1, addr:$src2, + (CommFrag.OperandTransform $cc))>; +} + +multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag, + PatFrag CommFrag, X86FoldableSchedWrite sched, + X86VectorVTInfo _, string Name> : + avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched, _, Name> { + def rmib : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, + AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (_.KVT (Frag:$cc + (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)), + cond)))]>, + EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + def rmibk : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2, AVX512ICC:$cc), + !strconcat("vpcmp${cc}", Suffix, + "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), + [(set _.KRC:$dst, (and _.KRCWM:$mask, + (_.KVT (Frag:$cc + (_.VT _.RC:$src1), + (X86VBroadcast + (_.ScalarLdFrag addr:$src2)), + cond))))]>, + EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + + // Accept explicit immediate argument form instead of comparison code. 
+ let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in { + def rmib_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, + u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|", + "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), []>, + EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>, + NotMemoryFoldable; + def rmibk_alt : AVX512AIi8<opc, MRMSrcMem, + (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, + _.ScalarMemOp:$src2, u8imm:$cc), + !strconcat("vpcmp", Suffix, + "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), []>, + EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>, + NotMemoryFoldable; + } + + def : Pat<(_.KVT (CommFrag:$cc (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + (_.VT _.RC:$src1), cond)), + (!cast<Instruction>(Name#_.ZSuffix#"rmib") + _.RC:$src1, addr:$src2, (CommFrag.OperandTransform $cc))>; + + def : Pat<(and _.KRCWM:$mask, + (_.KVT (CommFrag:$cc (X86VBroadcast + (_.ScalarLdFrag addr:$src2)), + (_.VT _.RC:$src1), cond))), + (!cast<Instruction>(Name#_.ZSuffix#"rmibk") + _.KRCWM:$mask, _.RC:$src1, addr:$src2, + (CommFrag.OperandTransform $cc))>; +} + +multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, PatFrag Frag, + PatFrag CommFrag, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.ZMM, + VTInfo.info512, NAME>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.YMM, + VTInfo.info256, NAME>, EVEX_V256; + defm Z128 : avx512_icmp_cc<opc, Suffix, Frag, CommFrag, sched.XMM, + VTInfo.info128, NAME>, EVEX_V128; + } +} + +multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, PatFrag Frag, + PatFrag CommFrag, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.ZMM, + VTInfo.info512, NAME>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.YMM, + VTInfo.info256, NAME>, EVEX_V256; + defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, Frag, CommFrag, sched.XMM, + VTInfo.info128, NAME>, EVEX_V128; + } +} + +def X86pcmpm_imm : SDNodeXForm<setcc, [{ + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + uint8_t SSECC = X86::getVPCMPImmForCond(CC); + return getI8Imm(SSECC, SDLoc(N)); +}]>; + +// Swapped operand version of the above. +def X86pcmpm_imm_commute : SDNodeXForm<setcc, [{ + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + uint8_t SSECC = X86::getVPCMPImmForCond(CC); + SSECC = X86::getSwappedVPCMPImm(SSECC); + return getI8Imm(SSECC, SDLoc(N)); +}]>; + +def X86pcmpm : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (setcc node:$src1, node:$src2, node:$cc), [{ + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + return !ISD::isUnsignedIntSetCC(CC); +}], X86pcmpm_imm>; + +// Same as above, but commutes immediate. Use for load folding. 
+def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (setcc node:$src1, node:$src2, node:$cc), [{ + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + return !ISD::isUnsignedIntSetCC(CC); +}], X86pcmpm_imm_commute>; + +def X86pcmpum : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (setcc node:$src1, node:$src2, node:$cc), [{ + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + return ISD::isUnsignedIntSetCC(CC); +}], X86pcmpm_imm>; + +// Same as above, but commutes immediate. Use for load folding. +def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc), + (setcc node:$src1, node:$src2, node:$cc), [{ + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + return ISD::isUnsignedIntSetCC(CC); +}], X86pcmpm_imm_commute>; + +// FIXME: Is there a better scheduler class for VPCMP/VPCMPU? +defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_commute, + SchedWriteVecALU, avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; +defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_commute, + SchedWriteVecALU, avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; + +defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_commute, + SchedWriteVecALU, avx512vl_i16_info, HasBWI>, + VEX_W, EVEX_CD8<16, CD8VF>; +defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_commute, + SchedWriteVecALU, avx512vl_i16_info, HasBWI>, + VEX_W, EVEX_CD8<16, CD8VF>; + +defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_commute, + SchedWriteVecALU, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; +defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_commute, + SchedWriteVecALU, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; + +defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_commute, + SchedWriteVecALU, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_commute, + SchedWriteVecALU, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; + +multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _, + string Name> { + defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc), 1>, + Sched<[sched]>; + + defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (X86cmpm (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc)>, + Sched<[sched.Folded, ReadAfterLd]>; + + defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (X86cmpm (_.VT _.RC:$src1), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + imm:$cc)>, + EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + // Accept explicit immediate argument form instead of comparison code. 
+ let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, + Sched<[sched]>, NotMemoryFoldable; + + let mayLoad = 1 in { + defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, + Sched<[sched.Folded, ReadAfterLd]>, + NotMemoryFoldable; + + defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, ${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr##", $cc">, + EVEX_B, Sched<[sched.Folded, ReadAfterLd]>, + NotMemoryFoldable; + } + } + + // Patterns for selecting with loads in other operand. + def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1), + CommutableCMPCC:$cc), + (!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, + imm:$cc)>; + + def : Pat<(and _.KRCWM:$mask, (X86cmpm (_.LdFrag addr:$src2), + (_.VT _.RC:$src1), + CommutableCMPCC:$cc)), + (!cast<Instruction>(Name#_.ZSuffix#"rmik") _.KRCWM:$mask, + _.RC:$src1, addr:$src2, + imm:$cc)>; + + def : Pat<(X86cmpm (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + (_.VT _.RC:$src1), CommutableCMPCC:$cc), + (!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, + imm:$cc)>; + + def : Pat<(and _.KRCWM:$mask, (X86cmpm (X86VBroadcast + (_.ScalarLdFrag addr:$src2)), + (_.VT _.RC:$src1), + CommutableCMPCC:$cc)), + (!cast<Instruction>(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask, + _.RC:$src1, addr:$src2, + imm:$cc)>; +} + +multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> { + // comparison code form (VCMP[EQ/LT/LE/...] + defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))>, + EVEX_B, Sched<[sched]>; + + let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $cc">, + EVEX_B, Sched<[sched]>, NotMemoryFoldable; + } +} + +multiclass avx512_vcmp<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcmp_common<sched.ZMM, _.info512, NAME>, + avx512_vcmp_sae<sched.ZMM, _.info512>, EVEX_V512; + + } + let Predicates = [HasAVX512,HasVLX] in { + defm Z128 : avx512_vcmp_common<sched.XMM, _.info128, NAME>, EVEX_V128; + defm Z256 : avx512_vcmp_common<sched.YMM, _.info256, NAME>, EVEX_V256; + } +} + +defm VCMPPD : avx512_vcmp<SchedWriteFCmp, avx512vl_f64_info>, + AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VCMPPS : avx512_vcmp<SchedWriteFCmp, avx512vl_f32_info>, + AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + +// Patterns to select fp compares with load as first operand. 
+let Predicates = [HasAVX512] in { + def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1, + CommutableCMPCC:$cc)), + (VCMPSDZrm FR64X:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1, + CommutableCMPCC:$cc)), + (VCMPSSZrm FR32X:$src1, addr:$src2, imm:$cc)>; +} + +// ---------------------------------------------------------------- +// FPClass +//handle fpclass instruction mask = op(reg_scalar,imm) +// op(mem_scalar,imm) +multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _, + Predicate prd> { + let Predicates = [prd], ExeDomain = _.ExeDomain in { + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))]>, + Sched<[sched]>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))]>, + EVEX_K, Sched<[sched]>; + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.IntScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst, + (OpNode _.ScalarIntMemCPat:$src1, + (i32 imm:$src2)))]>, + Sched<[sched.Folded, ReadAfterLd]>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(and _.KRCWM:$mask, + (OpNode _.ScalarIntMemCPat:$src1, + (i32 imm:$src2))))]>, + EVEX_K, Sched<[sched.Folded, ReadAfterLd]>; + } +} + +//handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm) +// fpclass(reg_vec, mem_vec, imm) +// fpclass(reg_vec, broadcast(eltVt), imm) +multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _, + string mem, string broadcast>{ + let ExeDomain = _.ExeDomain in { + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))]>, + Sched<[sched]>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(and _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))]>, + EVEX_K, Sched<[sched]>; + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2)))]>, + Sched<[sched.Folded, ReadAfterLd]>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))]>, + EVEX_K, Sched<[sched.Folded, ReadAfterLd]>; + def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.ScalarMemOp:$src1, i32u8imm:$src2), + 
OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + _.BroadcastStr##", $dst|$dst, ${src1}" + ##_.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2)))]>, + EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"## + _.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(and _.KRCWM:$mask, (OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2))))]>, + EVEX_B, EVEX_K, Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_vector_fpclass_all<string OpcodeStr, AVX512VLVectorVTInfo _, + bits<8> opc, SDNode OpNode, + X86SchedWriteWidths sched, Predicate prd, + string broadcast>{ + let Predicates = [prd] in { + defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.ZMM, + _.info512, "{z}", broadcast>, EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.XMM, + _.info128, "{x}", broadcast>, EVEX_V128; + defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, sched.YMM, + _.info256, "{y}", broadcast>, EVEX_V256; + } +} + +multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec, + bits<8> opcScalar, SDNode VecOpNode, + SDNode ScalarOpNode, X86SchedWriteWidths sched, + Predicate prd> { + defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec, + VecOpNode, sched, prd, "{l}">, + EVEX_CD8<32, CD8VF>; + defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec, + VecOpNode, sched, prd, "{q}">, + EVEX_CD8<64, CD8VF> , VEX_W; + defm SSZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + sched.Scl, f32x_info, prd>, + EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + sched.Scl, f64x_info, prd>, + EVEX_CD8<64, CD8VT1>, VEX_W; +} + +defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass, + X86Vfpclasss, SchedWriteFCmp, HasDQI>, + AVX512AIi8Base, EVEX; + +//----------------------------------------------------------------- +// Mask register copy, including +// - copy between mask registers +// - load/store mask registers +// - copy from GPR to mask register and vice versa +// +multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk, + string OpcodeStr, RegisterClass KRC, + ValueType vvt, X86MemOperand x86memop> { + let isMoveReg = 1, hasSideEffects = 0, SchedRW = [WriteMove] in + def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, + Sched<[WriteMove]>; + def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (vvt (load addr:$src)))]>, + Sched<[WriteLoad]>; + def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store KRC:$src, addr:$dst)]>, + Sched<[WriteStore]>; +} + +multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, + string OpcodeStr, + RegisterClass KRC, RegisterClass GRC> { + let hasSideEffects = 0 in { + def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, + Sched<[WriteMove]>; + def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, 
"\t{$src, $dst|$dst, $src}"), []>, + Sched<[WriteMove]>; + } +} + +let Predicates = [HasDQI] in + defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>, + avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>, + VEX, PD; + +let Predicates = [HasAVX512] in + defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>, + avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, + VEX, PS; + +let Predicates = [HasBWI] in { + defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>, + VEX, PD, VEX_W; + defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>, + VEX, XD; + defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>, + VEX, PS, VEX_W; + defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>, + VEX, XD, VEX_W; +} + +// GR from/to mask register +def : Pat<(v16i1 (bitconvert (i16 GR16:$src))), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16)>; +def : Pat<(i16 (bitconvert (v16i1 VK16:$src))), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_16bit)>; + +def : Pat<(v8i1 (bitconvert (i8 GR8:$src))), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$src, sub_8bit)), VK8)>; +def : Pat<(i8 (bitconvert (v8i1 VK8:$src))), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit)>; + +def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))), + (KMOVWrk VK16:$src)>; +def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))), + (COPY_TO_REGCLASS VK16:$src, GR32)>; + +def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))), + (KMOVBrk VK8:$src)>, Requires<[HasDQI]>; +def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))), + (COPY_TO_REGCLASS VK8:$src, GR32)>; + +def : Pat<(v32i1 (bitconvert (i32 GR32:$src))), + (COPY_TO_REGCLASS GR32:$src, VK32)>; +def : Pat<(i32 (bitconvert (v32i1 VK32:$src))), + (COPY_TO_REGCLASS VK32:$src, GR32)>; +def : Pat<(v64i1 (bitconvert (i64 GR64:$src))), + (COPY_TO_REGCLASS GR64:$src, VK64)>; +def : Pat<(i64 (bitconvert (v64i1 VK64:$src))), + (COPY_TO_REGCLASS VK64:$src, GR64)>; + +// Load/store kreg +let Predicates = [HasDQI] in { + def : Pat<(store VK1:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>; + + def : Pat<(v1i1 (load addr:$src)), + (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>; + def : Pat<(v2i1 (load addr:$src)), + (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>; + def : Pat<(v4i1 (load addr:$src)), + (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), + (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>; +} + +let Predicates = [HasAVX512] in { + multiclass operation_gpr_mask_copy_lowering<RegisterClass maskRC, ValueType maskVT> { + def : Pat<(maskVT (scalar_to_vector GR32:$src)), + (COPY_TO_REGCLASS GR32:$src, maskRC)>; + + def : Pat<(maskVT (scalar_to_vector GR8:$src)), + (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>; + } + + defm : operation_gpr_mask_copy_lowering<VK1, v1i1>; + defm : operation_gpr_mask_copy_lowering<VK2, v2i1>; + defm : operation_gpr_mask_copy_lowering<VK4, v4i1>; + defm : operation_gpr_mask_copy_lowering<VK8, v8i1>; + defm : operation_gpr_mask_copy_lowering<VK16, v16i1>; + defm : operation_gpr_mask_copy_lowering<VK32, v32i1>; + defm : operation_gpr_mask_copy_lowering<VK64, v64i1>; + + def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v1i1 (scalar_to_vector GR8:$src)), (iPTR 0)), + (COPY_TO_REGCLASS + (KMOVWkr 
(AND32ri8 + (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), + (i32 1))), VK16)>; +} + +// Mask unary operation +// - KNOT +multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr, + RegisterClass KRC, SDPatternOperator OpNode, + X86FoldableSchedWrite sched, Predicate prd> { + let Predicates = [prd] in + def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set KRC:$dst, (OpNode KRC:$src))]>, + Sched<[sched]>; +} + +multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode, + X86FoldableSchedWrite sched> { + defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, + sched, HasDQI>, VEX, PD; + defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, + sched, HasAVX512>, VEX, PS; + defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, + sched, HasBWI>, VEX, PD, VEX_W; + defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, + sched, HasBWI>, VEX, PS, VEX_W; +} + +// TODO - do we need a X86SchedWriteWidths::KMASK type? +defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot, SchedWriteVecLogic.XMM>; + +// KNL does not support KMOVB, 8-bit mask is promoted to 16-bit +let Predicates = [HasAVX512, NoDQI] in +def : Pat<(vnot VK8:$src), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>; + +def : Pat<(vnot VK4:$src), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src, VK16)), VK4)>; +def : Pat<(vnot VK2:$src), + (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src, VK16)), VK2)>; + +// Mask binary operation +// - KAND, KANDN, KOR, KXNOR, KXOR +multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, + RegisterClass KRC, SDPatternOperator OpNode, + X86FoldableSchedWrite sched, Predicate prd, + bit IsCommutable> { + let Predicates = [prd], isCommutable = IsCommutable in + def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>, + Sched<[sched]>; +} + +multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode, + X86FoldableSchedWrite sched, bit IsCommutable, + Predicate prdW = HasAVX512> { + defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, + sched, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; + defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, + sched, prdW, IsCommutable>, VEX_4V, VEX_L, PS; + defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, + sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD; + defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, + sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS; +} + +def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; +def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; +// These nodes use 'vnot' instead of 'not' to support vectors. +def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>; +def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>; + +// TODO - do we need a X86SchedWriteWidths::KMASK type? 
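+// Note that one of the instantiations below also provides a cheap all-ones
+// mask idiom: "kxnorw %k0, %k0, %k1" sets every bit of %k1, since x XNOR x
+// is always 1 regardless of the source value.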
+defm KAND : avx512_mask_binop_all<0x41, "kand", and, SchedWriteVecLogic.XMM, 1>; +defm KOR : avx512_mask_binop_all<0x45, "kor", or, SchedWriteVecLogic.XMM, 1>; +defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, SchedWriteVecLogic.XMM, 1>; +defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SchedWriteVecLogic.XMM, 1>; +defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SchedWriteVecLogic.XMM, 0>; +defm KADD : avx512_mask_binop_all<0x4A, "kadd", X86kadd, SchedWriteVecLogic.XMM, 1, HasDQI>; + +multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode, + Instruction Inst> { + // With AVX512F, 8-bit mask is promoted to 16-bit mask, + // for the DQI set, this type is legal and KxxxB instruction is used + let Predicates = [NoDQI] in + def : Pat<(VOpNode VK8:$src1, VK8:$src2), + (COPY_TO_REGCLASS + (Inst (COPY_TO_REGCLASS VK8:$src1, VK16), + (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>; + + // All types smaller than 8 bits require conversion anyway + def : Pat<(OpNode VK1:$src1, VK1:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK1:$src1, VK16), + (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; + def : Pat<(VOpNode VK2:$src1, VK2:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK2:$src1, VK16), + (COPY_TO_REGCLASS VK2:$src2, VK16)), VK1)>; + def : Pat<(VOpNode VK4:$src1, VK4:$src2), + (COPY_TO_REGCLASS (Inst + (COPY_TO_REGCLASS VK4:$src1, VK16), + (COPY_TO_REGCLASS VK4:$src2, VK16)), VK1)>; +} + +defm : avx512_binop_pat<and, and, KANDWrr>; +defm : avx512_binop_pat<vandn, andn, KANDNWrr>; +defm : avx512_binop_pat<or, or, KORWrr>; +defm : avx512_binop_pat<vxnor, xnor, KXNORWrr>; +defm : avx512_binop_pat<xor, xor, KXORWrr>; + +// Mask unpacking +multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT, + RegisterClass KRCSrc, X86FoldableSchedWrite sched, + Predicate prd> { + let Predicates = [prd] in { + let hasSideEffects = 0 in + def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), + (ins KRC:$src1, KRC:$src2), + "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V, VEX_L, Sched<[sched]>; + + def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)), + (!cast<Instruction>(NAME##rr) + (COPY_TO_REGCLASS KRCSrc:$src2, KRC), + (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>; + } +} + +defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, WriteShuffle, HasAVX512>, PD; +defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, WriteShuffle, HasBWI>, PS; +defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, WriteShuffle, HasBWI>, PS, VEX_W; + +// Mask bit testing +multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, + SDNode OpNode, X86FoldableSchedWrite sched, + Predicate prd> { + let Predicates = [prd], Defs = [EFLAGS] in + def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>, + Sched<[sched]>; +} + +multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, + Predicate prdW = HasAVX512> { + defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, sched, HasDQI>, + VEX, PD; + defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, sched, prdW>, + VEX, PS; + defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, sched, HasBWI>, + VEX, PS, VEX_W; + defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, sched, HasBWI>, + VEX, PD, VEX_W; +} + +// TODO - do we need a X86SchedWriteWidths::KMASK type? 
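+// KORTEST ORs the two masks and only updates EFLAGS: ZF is set when the OR is
+// all zeros and CF when it is all ones, so e.g. "kortestw %k1, %k1" followed
+// by "je" branches when no mask lane is active.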
+defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest, SchedWriteVecLogic.XMM>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, SchedWriteVecLogic.XMM, HasDQI>; + +// Mask shift +multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, + SDNode OpNode, X86FoldableSchedWrite sched> { + let Predicates = [HasAVX512] in + def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm), + !strconcat(OpcodeStr, + "\t{$imm, $src, $dst|$dst, $src, $imm}"), + [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>, + Sched<[sched]>; +} + +multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, + SDNode OpNode, X86FoldableSchedWrite sched> { + defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode, + sched>, VEX, TAPD, VEX_W; + let Predicates = [HasDQI] in + defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode, + sched>, VEX, TAPD; + let Predicates = [HasBWI] in { + defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode, + sched>, VEX, TAPD, VEX_W; + defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode, + sched>, VEX, TAPD; + } +} + +defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShuffle>; +defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>; + +// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. +multiclass axv512_icmp_packed_no_vlx_lowering<PatFrag Frag, string InstStr, + X86VectorVTInfo Narrow, + X86VectorVTInfo Wide> { + def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2))), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr#"Zrr") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))), + Narrow.KRC)>; + + def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (Frag (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2)))), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr#"Zrrk") + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))), + Narrow.KRC)>; +} + +// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. +multiclass axv512_icmp_packed_cc_no_vlx_lowering<PatFrag Frag, + string InstStr, + X86VectorVTInfo Narrow, + X86VectorVTInfo Wide> { +def : Pat<(Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), cond)), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr##Zrri) + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), + (Frag.OperandTransform $cc)), Narrow.KRC)>; + +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (Narrow.KVT (Frag:$cc (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), + cond)))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik) + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), + (Frag.OperandTransform $cc)), Narrow.KRC)>; +} + +// Same as above, but for fp types which don't use PatFrags. 
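+// As in the integer versions above, the narrow operands are widened with
+// INSERT_SUBREG into an IMPLICIT_DEF zmm register, the 512-bit compare is
+// used, and only the low bits of the resulting mask are kept via
+// COPY_TO_REGCLASS; the undefined upper lanes of the wide compare are ignored.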
+multiclass axv512_cmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr, + X86VectorVTInfo Narrow, + X86VectorVTInfo Wide> { +def : Pat<(Narrow.KVT (OpNode (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), imm:$cc)), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr##Zrri) + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), + imm:$cc), Narrow.KRC)>; + +def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, + (OpNode (Narrow.VT Narrow.RC:$src1), + (Narrow.VT Narrow.RC:$src2), imm:$cc))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik) + (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx)), + imm:$cc), Narrow.KRC)>; +} + +let Predicates = [HasAVX512, NoVLX] in { + // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't + // increase the pattern complexity the way an immediate would. + let AddedComplexity = 2 in { + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v8i32x_info, v16i32_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v8i32x_info, v16i32_info>; + + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTD", v4i32x_info, v16i32_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQD", v4i32x_info, v16i32_info>; + + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v4i64x_info, v8i64_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v4i64x_info, v8i64_info>; + + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTQ", v2i64x_info, v8i64_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQQ", v2i64x_info, v8i64_info>; + } + + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v8i32x_info, v16i32_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v8i32x_info, v16i32_info>; + + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPD", v4i32x_info, v16i32_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUD", v4i32x_info, v16i32_info>; + + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v4i64x_info, v8i64_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v4i64x_info, v8i64_info>; + + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPQ", v2i64x_info, v8i64_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUQ", v2i64x_info, v8i64_info>; + + defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v8f32x_info, v16f32_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPS", v4f32x_info, v16f32_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v4f64x_info, v8f64_info>; + defm : axv512_cmp_packed_cc_no_vlx_lowering<X86cmpm, "VCMPPD", v2f64x_info, v8f64_info>; +} + +let Predicates = [HasBWI, NoVLX] in { + // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't + // increase the pattern complexity the way an immediate would. 
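+  // Byte and word compares are handled separately because even the 512-bit
+  // vpcmp[eq|gt]b/w forms require AVX512BW; otherwise the widening is the same
+  // as for the dword/qword cases above.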
+ let AddedComplexity = 2 in { + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v32i8x_info, v64i8_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v32i8x_info, v64i8_info>; + + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTB", v16i8x_info, v64i8_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQB", v16i8x_info, v64i8_info>; + + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v16i16x_info, v32i16_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v16i16x_info, v32i16_info>; + + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpgtm, "VPCMPGTW", v8i16x_info, v32i16_info>; + defm : axv512_icmp_packed_no_vlx_lowering<X86pcmpeqm_c, "VPCMPEQW", v8i16x_info, v32i16_info>; + } + + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v32i8x_info, v64i8_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v32i8x_info, v64i8_info>; + + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPB", v16i8x_info, v64i8_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUB", v16i8x_info, v64i8_info>; + + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v16i16x_info, v32i16_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v16i16x_info, v32i16_info>; + + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, "VPCMPW", v8i16x_info, v32i16_info>; + defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, "VPCMPUW", v8i16x_info, v32i16_info>; +} + +// Mask setting all 0s or 1s +multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { + let Predicates = [HasAVX512] in + let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1, + SchedRW = [WriteZero] in + def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "", + [(set KRC:$dst, (VT Val))]>; +} + +multiclass avx512_mask_setop_w<PatFrag Val> { + defm W : avx512_mask_setop<VK16, v16i1, Val>; + defm D : avx512_mask_setop<VK32, v32i1, Val>; + defm Q : avx512_mask_setop<VK64, v64i1, Val>; +} + +defm KSET0 : avx512_mask_setop_w<immAllZerosV>; +defm KSET1 : avx512_mask_setop_w<immAllOnesV>; + +// With AVX-512 only, 8-bit mask is promoted to 16-bit mask. 
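+// For example, a v8i1 all-zeros value is materialized with the 16-bit KSET0W
+// pseudo and then reinterpreted as VK8; only the low 8 bits are ever read, so
+// the extra width is harmless. The same applies to the v4i1/v2i1/v1i1
+// patterns below.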
+let Predicates = [HasAVX512] in { + def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>; + def : Pat<(v4i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK4)>; + def : Pat<(v2i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK2)>; + def : Pat<(v1i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK1)>; + def : Pat<(v8i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK8)>; + def : Pat<(v4i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK4)>; + def : Pat<(v2i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK2)>; + def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>; +} + +// Patterns for kmask insert_subvector/extract_subvector to/from index=0 +multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT, + RegisterClass RC, ValueType VT> { + def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))), + (subVT (COPY_TO_REGCLASS RC:$src, subRC))>; + + def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))), + (VT (COPY_TO_REGCLASS subRC:$src, RC))>; +} +defm : operation_subvector_mask_lowering<VK1, v1i1, VK2, v2i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK4, v4i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK8, v8i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK16, v16i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK32, v32i1>; +defm : operation_subvector_mask_lowering<VK1, v1i1, VK64, v64i1>; + +defm : operation_subvector_mask_lowering<VK2, v2i1, VK4, v4i1>; +defm : operation_subvector_mask_lowering<VK2, v2i1, VK8, v8i1>; +defm : operation_subvector_mask_lowering<VK2, v2i1, VK16, v16i1>; +defm : operation_subvector_mask_lowering<VK2, v2i1, VK32, v32i1>; +defm : operation_subvector_mask_lowering<VK2, v2i1, VK64, v64i1>; + +defm : operation_subvector_mask_lowering<VK4, v4i1, VK8, v8i1>; +defm : operation_subvector_mask_lowering<VK4, v4i1, VK16, v16i1>; +defm : operation_subvector_mask_lowering<VK4, v4i1, VK32, v32i1>; +defm : operation_subvector_mask_lowering<VK4, v4i1, VK64, v64i1>; + +defm : operation_subvector_mask_lowering<VK8, v8i1, VK16, v16i1>; +defm : operation_subvector_mask_lowering<VK8, v8i1, VK32, v32i1>; +defm : operation_subvector_mask_lowering<VK8, v8i1, VK64, v64i1>; + +defm : operation_subvector_mask_lowering<VK16, v16i1, VK32, v32i1>; +defm : operation_subvector_mask_lowering<VK16, v16i1, VK64, v64i1>; + +defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Aligned and unaligned load and store +// + +multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name, + X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload, + X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd, + bit NoRMPattern = 0, + SDPatternOperator SelectOprr = vselect> { + let hasSideEffects = 0 in { + let isMoveReg = 1 in + def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], + _.ExeDomain>, EVEX, Sched<[Sched.RR]>, + EVEX2VEXOverride<EVEX2VEXOvrd#"rr">; + def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", + "${dst} {${mask}} {z}, $src}"), + [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask, + (_.VT _.RC:$src), + _.ImmAllZerosV)))], _.ExeDomain>, + EVEX, EVEX_KZ, Sched<[Sched.RR]>; + + let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1 in + def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src), + !strconcat(OpcodeStr, "\t{$src, 
$dst|$dst, $src}"), + !if(NoRMPattern, [], + [(set _.RC:$dst, + (_.VT (bitconvert (ld_frag addr:$src))))]), + _.ExeDomain>, EVEX, Sched<[Sched.RM]>, + EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; + + let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in { + def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask, + (_.VT _.RC:$src1), + (_.VT _.RC:$src0))))], _.ExeDomain>, + EVEX, EVEX_K, Sched<[Sched.RR]>; + def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set _.RC:$dst, (_.VT + (vselect _.KRCWM:$mask, + (_.VT (bitconvert (ld_frag addr:$src1))), + (_.VT _.RC:$src0))))], _.ExeDomain>, + EVEX, EVEX_K, Sched<[Sched.RM]>; + } + def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src), + OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"# + "${dst} {${mask}} {z}, $src}", + [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, + (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))], + _.ExeDomain>, EVEX, EVEX_KZ, Sched<[Sched.RM]>; + } + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)), + (!cast<Instruction>(Name#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)), + (!cast<Instruction>(Name#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; + + def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))), + (!cast<Instruction>(Name#_.ZSuffix##rmk) _.RC:$src0, + _.KRCWM:$mask, addr:$ptr)>; +} + +multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd, + X86SchedWriteMoveLSWidths Sched, + string EVEX2VEXOvrd, bit NoRMPattern = 0> { + let Predicates = [prd] in + defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, + _.info512.AlignedLdFrag, masked_load_aligned512, + Sched.ZMM, "", NoRMPattern>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, + _.info256.AlignedLdFrag, masked_load_aligned256, + Sched.YMM, EVEX2VEXOvrd#"Y", NoRMPattern>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, + _.info128.AlignedLdFrag, masked_load_aligned128, + Sched.XMM, EVEX2VEXOvrd, NoRMPattern>, EVEX_V128; + } +} + +multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd, + X86SchedWriteMoveLSWidths Sched, + string EVEX2VEXOvrd, bit NoRMPattern = 0, + SDPatternOperator SelectOprr = vselect> { + let Predicates = [prd] in + defm Z : avx512_load<opc, OpcodeStr, NAME, _.info512, _.info512.LdFrag, + masked_load_unaligned, Sched.ZMM, "", + NoRMPattern, SelectOprr>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_load<opc, OpcodeStr, NAME, _.info256, _.info256.LdFrag, + masked_load_unaligned, Sched.YMM, EVEX2VEXOvrd#"Y", + NoRMPattern, SelectOprr>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, NAME, _.info128, _.info128.LdFrag, + masked_load_unaligned, Sched.XMM, EVEX2VEXOvrd, + NoRMPattern, SelectOprr>, EVEX_V128; + } +} + +multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName, + X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore, + X86SchedWriteMoveLS Sched, string EVEX2VEXOvrd, + bit NoMRPattern = 0> { + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + let 
isMoveReg = 1 in + def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), + OpcodeStr # "\t{$src, $dst|$dst, $src}", + [], _.ExeDomain>, EVEX, + FoldGenData<BaseName#_.ZSuffix#rr>, Sched<[Sched.RR]>, + EVEX2VEXOverride<EVEX2VEXOvrd#"rr_REV">; + def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, ${dst} {${mask}}|"# + "${dst} {${mask}}, $src}", + [], _.ExeDomain>, EVEX, EVEX_K, + FoldGenData<BaseName#_.ZSuffix#rrk>, + Sched<[Sched.RR]>; + def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, ${dst} {${mask}} {z}|" # + "${dst} {${mask}} {z}, $src}", + [], _.ExeDomain>, EVEX, EVEX_KZ, + FoldGenData<BaseName#_.ZSuffix#rrkz>, + Sched<[Sched.RR]>; + } + + let hasSideEffects = 0, mayStore = 1 in + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + !if(NoMRPattern, [], + [(st_frag (_.VT _.RC:$src), addr:$dst)]), + _.ExeDomain>, EVEX, Sched<[Sched.MR]>, + EVEX2VEXOverride<EVEX2VEXOvrd#"mr">; + def mrk : AVX512PI<opc, MRMDestMem, (outs), + (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}", + [], _.ExeDomain>, EVEX, EVEX_K, Sched<[Sched.MR]>, + NotMemoryFoldable; + + def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)), + (!cast<Instruction>(BaseName#_.ZSuffix#mrk) addr:$ptr, + _.KRCWM:$mask, _.RC:$src)>; + + def : InstAlias<OpcodeStr#".s\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(BaseName#_.ZSuffix#"rr_REV") + _.RC:$dst, _.RC:$src), 0>; + def : InstAlias<OpcodeStr#".s\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}", + (!cast<Instruction>(BaseName#_.ZSuffix#"rrk_REV") + _.RC:$dst, _.KRCWM:$mask, _.RC:$src), 0>; + def : InstAlias<OpcodeStr#".s\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}", + (!cast<Instruction>(BaseName#_.ZSuffix#"rrkz_REV") + _.RC:$dst, _.KRCWM:$mask, _.RC:$src), 0>; +} + +multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd, + X86SchedWriteMoveLSWidths Sched, + string EVEX2VEXOvrd, bit NoMRPattern = 0> { + let Predicates = [prd] in + defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, store, + masked_store_unaligned, Sched.ZMM, "", + NoMRPattern>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, store, + masked_store_unaligned, Sched.YMM, + EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, store, + masked_store_unaligned, Sched.XMM, EVEX2VEXOvrd, + NoMRPattern>, EVEX_V128; + } +} + +multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd, + X86SchedWriteMoveLSWidths Sched, + string EVEX2VEXOvrd, bit NoMRPattern = 0> { + let Predicates = [prd] in + defm Z : avx512_store<opc, OpcodeStr, NAME, _.info512, alignedstore, + masked_store_aligned512, Sched.ZMM, "", + NoMRPattern>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_store<opc, OpcodeStr, NAME, _.info256, alignedstore, + masked_store_aligned256, Sched.YMM, + EVEX2VEXOvrd#"Y", NoMRPattern>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, NAME, _.info128, alignedstore, + masked_store_aligned128, Sched.XMM, EVEX2VEXOvrd, + NoMRPattern>, EVEX_V128; + } +} + +defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, + HasAVX512, SchedWriteFMoveLS, "VMOVAPS">, + 
avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, + HasAVX512, SchedWriteFMoveLS, "VMOVAPS">, + PS, EVEX_CD8<32, CD8VF>; + +defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, + HasAVX512, SchedWriteFMoveLS, "VMOVAPD">, + avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, + HasAVX512, SchedWriteFMoveLS, "VMOVAPD">, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512, + SchedWriteFMoveLS, "VMOVUPS", 0, null_frag>, + avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512, + SchedWriteFMoveLS, "VMOVUPS">, + PS, EVEX_CD8<32, CD8VF>; + +defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, + SchedWriteFMoveLS, "VMOVUPD", 0, null_frag>, + avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512, + SchedWriteFMoveLS, "VMOVUPD">, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, + HasAVX512, SchedWriteVecMoveLS, + "VMOVDQA", 1>, + avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, + HasAVX512, SchedWriteVecMoveLS, + "VMOVDQA", 1>, + PD, EVEX_CD8<32, CD8VF>; + +defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, + HasAVX512, SchedWriteVecMoveLS, + "VMOVDQA">, + avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, + HasAVX512, SchedWriteVecMoveLS, + "VMOVDQA">, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, + SchedWriteVecMoveLS, "VMOVDQU", 1>, + avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI, + SchedWriteVecMoveLS, "VMOVDQU", 1>, + XD, EVEX_CD8<8, CD8VF>; + +defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, + SchedWriteVecMoveLS, "VMOVDQU", 1>, + avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI, + SchedWriteVecMoveLS, "VMOVDQU", 1>, + XD, VEX_W, EVEX_CD8<16, CD8VF>; + +defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, + SchedWriteVecMoveLS, "VMOVDQU", 1, null_frag>, + avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512, + SchedWriteVecMoveLS, "VMOVDQU", 1>, + XS, EVEX_CD8<32, CD8VF>; + +defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, + SchedWriteVecMoveLS, "VMOVDQU", 0, null_frag>, + avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512, + SchedWriteVecMoveLS, "VMOVDQU">, + XS, VEX_W, EVEX_CD8<64, CD8VF>; + +/* +// Special instructions to help with spilling when we don't have VLX. We need +// to load or store from a ZMM register instead. These are converted in +// expandPostRAPseudos. 
+let isReMaterializable = 1, canFoldAsLoad = 1, + isPseudo = 1, mayLoad = 1, hasSideEffects = 0 in { +def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src), + "", []>, Sched<[WriteFLoadX]>; +def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src), + "", []>, Sched<[WriteFLoadY]>; +def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src), + "", []>, Sched<[WriteFLoadX]>; +def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src), + "", []>, Sched<[WriteFLoadY]>; +} + +let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src), + "", []>, Sched<[WriteFStoreX]>; +def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src), + "", []>, Sched<[WriteFStoreY]>; +def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src), + "", []>, Sched<[WriteFStoreX]>; +def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src), + "", []>, Sched<[WriteFStoreY]>; +} +*/ + +def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)), + (v8i64 VR512:$src))), + (VMOVDQA64Zrrkz (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$mask, VK16)), + VK8), VR512:$src)>; + +def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), + (v16i32 VR512:$src))), + (VMOVDQA32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; + +// These patterns exist to prevent the above patterns from introducing a second +// mask inversion when one already exists. +def : Pat<(v8i64 (vselect (xor VK8:$mask, (v8i1 immAllOnesV)), + (bc_v8i64 (v16i32 immAllZerosV)), + (v8i64 VR512:$src))), + (VMOVDQA64Zrrkz VK8:$mask, VR512:$src)>; +def : Pat<(v16i32 (vselect (xor VK16:$mask, (v16i1 immAllOnesV)), + (v16i32 immAllZerosV), + (v16i32 VR512:$src))), + (VMOVDQA32Zrrkz VK16WM:$mask, VR512:$src)>; + +multiclass mask_move_lowering<string InstrStr, X86VectorVTInfo Narrow, + X86VectorVTInfo Wide> { + def : Pat<(Narrow.VT (vselect (Narrow.KVT Narrow.KRCWM:$mask), + Narrow.RC:$src1, Narrow.RC:$src0)), + (EXTRACT_SUBREG + (Wide.VT + (!cast<Instruction>(InstrStr#"rrk") + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src0, Narrow.SubRegIdx)), + (COPY_TO_REGCLASS Narrow.KRCWM:$mask, Wide.KRCWM), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)))), + Narrow.SubRegIdx)>; + + def : Pat<(Narrow.VT (vselect (Narrow.KVT Narrow.KRCWM:$mask), + Narrow.RC:$src1, Narrow.ImmAllZerosV)), + (EXTRACT_SUBREG + (Wide.VT + (!cast<Instruction>(InstrStr#"rrkz") + (COPY_TO_REGCLASS Narrow.KRCWM:$mask, Wide.KRCWM), + (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)))), + Narrow.SubRegIdx)>; +} + +// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't +// available. Use a 512-bit operation and extract. 
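Without AVX512VL there is no masked-move encoding for xmm/ymm operands, so the instantiations below insert the narrow operands into a zmm, perform the 512-bit masked move, and extract the low subregister again. A hedged C sketch of code that can give rise to such a narrow vselect on an AVX512F-only target; the typedef and function name are mine, and whether the compiler actually forms a v8i1 select here depends on the frontend and optimization level:

/* Illustrative sketch only; relies on GCC/Clang vector extensions. */
typedef int v8si __attribute__((vector_size(32)));

v8si max_lanes(v8si a, v8si b) {
    v8si m = a > b;             /* lanewise compare: each lane is 0 or -1 */
    return (a & m) | (b & ~m);  /* select a where the compare was true */
}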
+let Predicates = [HasAVX512, NoVLX] in { + defm : mask_move_lowering<"VMOVAPSZ", v4f32x_info, v16f32_info>; + defm : mask_move_lowering<"VMOVDQA32Z", v4i32x_info, v16i32_info>; + defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>; + defm : mask_move_lowering<"VMOVDQA32Z", v8i32x_info, v16i32_info>; + + defm : mask_move_lowering<"VMOVAPDZ", v2f64x_info, v8f64_info>; + defm : mask_move_lowering<"VMOVDQA64Z", v2i64x_info, v8i64_info>; + defm : mask_move_lowering<"VMOVAPDZ", v4f64x_info, v8f64_info>; + defm : mask_move_lowering<"VMOVDQA64Z", v4i64x_info, v8i64_info>; +} + +let Predicates = [HasBWI, NoVLX] in { + defm : mask_move_lowering<"VMOVDQU8Z", v16i8x_info, v64i8_info>; + defm : mask_move_lowering<"VMOVDQU8Z", v32i8x_info, v64i8_info>; + + defm : mask_move_lowering<"VMOVDQU16Z", v8i16x_info, v32i16_info>; + defm : mask_move_lowering<"VMOVDQU16Z", v16i16x_info, v32i16_info>; +} + +let Predicates = [HasAVX512] in { + // 512-bit store. + def : Pat<(alignedstore (v16i32 VR512:$src), addr:$dst), + (VMOVDQA64Zmr addr:$dst, VR512:$src)>; + def : Pat<(alignedstore (v32i16 VR512:$src), addr:$dst), + (VMOVDQA64Zmr addr:$dst, VR512:$src)>; + def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst), + (VMOVDQA64Zmr addr:$dst, VR512:$src)>; + def : Pat<(store (v16i32 VR512:$src), addr:$dst), + (VMOVDQU64Zmr addr:$dst, VR512:$src)>; + def : Pat<(store (v32i16 VR512:$src), addr:$dst), + (VMOVDQU64Zmr addr:$dst, VR512:$src)>; + def : Pat<(store (v64i8 VR512:$src), addr:$dst), + (VMOVDQU64Zmr addr:$dst, VR512:$src)>; +} + +let Predicates = [HasVLX] in { + // 128-bit store. + def : Pat<(alignedstore (v4i32 VR128X:$src), addr:$dst), + (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst), + (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst), + (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(store (v4i32 VR128X:$src), addr:$dst), + (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(store (v8i16 VR128X:$src), addr:$dst), + (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; + def : Pat<(store (v16i8 VR128X:$src), addr:$dst), + (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; + + // 256-bit store. 
+ def : Pat<(alignedstore (v8i32 VR256X:$src), addr:$dst), + (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(alignedstore (v16i16 VR256X:$src), addr:$dst), + (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst), + (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(store (v8i32 VR256X:$src), addr:$dst), + (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(store (v16i16 VR256X:$src), addr:$dst), + (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; + def : Pat<(store (v32i8 VR256X:$src), addr:$dst), + (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; +} + +multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, X86VectorVTInfo Cast> { + def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, + (bitconvert + (To.VT (extract_subvector + (From.VT From.RC:$src), (iPTR 0)))), + To.RC:$src0)), + (Cast.VT (!cast<Instruction>(InstrStr#"rrk") + Cast.RC:$src0, Cast.KRCWM:$mask, + (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>; + + def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, + (bitconvert + (To.VT (extract_subvector + (From.VT From.RC:$src), (iPTR 0)))), + Cast.ImmAllZerosV)), + (Cast.VT (!cast<Instruction>(InstrStr#"rrkz") + Cast.KRCWM:$mask, + (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>; +} + + +let Predicates = [HasVLX] in { +// A masked extract from the first 128-bits of a 256-bit vector can be +// implemented with masked move. +defm : masked_move_for_extract<"VMOVDQA64Z128", v4i64x_info, v2i64x_info, v2i64x_info>; +defm : masked_move_for_extract<"VMOVDQA64Z128", v8i32x_info, v4i32x_info, v2i64x_info>; +defm : masked_move_for_extract<"VMOVDQA64Z128", v16i16x_info, v8i16x_info, v2i64x_info>; +defm : masked_move_for_extract<"VMOVDQA64Z128", v32i8x_info, v16i8x_info, v2i64x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z128", v4i64x_info, v2i64x_info, v4i32x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z128", v8i32x_info, v4i32x_info, v4i32x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z128", v16i16x_info, v8i16x_info, v4i32x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z128", v32i8x_info, v16i8x_info, v4i32x_info>; +defm : masked_move_for_extract<"VMOVAPDZ128", v4f64x_info, v2f64x_info, v2f64x_info>; +defm : masked_move_for_extract<"VMOVAPDZ128", v8f32x_info, v4f32x_info, v2f64x_info>; +defm : masked_move_for_extract<"VMOVAPSZ128", v4f64x_info, v2f64x_info, v4f32x_info>; +defm : masked_move_for_extract<"VMOVAPSZ128", v8f32x_info, v4f32x_info, v4f32x_info>; + +// A masked extract from the first 128-bits of a 512-bit vector can be +// implemented with masked move. 
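In other words, when only the low 128 (or 256) bits of a wider vector are consumed under a mask, the extract is just a subregister read, and the whole expression collapses into a single masked register-to-register move. A minimal C sketch of the shape being matched by the defm lines below, assuming the VL intrinsics are available; the function name is mine:

#include <immintrin.h>

/* Illustrative sketch only (AVX512F+VL): the cast is the index-0
   extract_subvector, and the masked move is what gets selected as one
   masked vmovdqa32 on the low xmm. */
__m128i low_lanes_masked(__m512i v, __m128i src0, __mmask8 k) {
    __m128i lo = _mm512_castsi512_si128(v);
    return _mm_mask_mov_epi32(src0, k, lo);
}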
+defm : masked_move_for_extract<"VMOVDQA64Z128", v8i64_info, v2i64x_info, v2i64x_info>; +defm : masked_move_for_extract<"VMOVDQA64Z128", v16i32_info, v4i32x_info, v2i64x_info>; +defm : masked_move_for_extract<"VMOVDQA64Z128", v32i16_info, v8i16x_info, v2i64x_info>; +defm : masked_move_for_extract<"VMOVDQA64Z128", v64i8_info, v16i8x_info, v2i64x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z128", v8i64_info, v2i64x_info, v4i32x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z128", v16i32_info, v4i32x_info, v4i32x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z128", v32i16_info, v8i16x_info, v4i32x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z128", v64i8_info, v16i8x_info, v4i32x_info>; +defm : masked_move_for_extract<"VMOVAPDZ128", v8f64_info, v2f64x_info, v2f64x_info>; +defm : masked_move_for_extract<"VMOVAPDZ128", v16f32_info, v4f32x_info, v2f64x_info>; +defm : masked_move_for_extract<"VMOVAPSZ128", v8f64_info, v2f64x_info, v4f32x_info>; +defm : masked_move_for_extract<"VMOVAPSZ128", v16f32_info, v4f32x_info, v4f32x_info>; + +// A masked extract from the first 256-bits of a 512-bit vector can be +// implemented with masked move. +defm : masked_move_for_extract<"VMOVDQA64Z256", v8i64_info, v4i64x_info, v4i64x_info>; +defm : masked_move_for_extract<"VMOVDQA64Z256", v16i32_info, v8i32x_info, v4i64x_info>; +defm : masked_move_for_extract<"VMOVDQA64Z256", v32i16_info, v16i16x_info, v4i64x_info>; +defm : masked_move_for_extract<"VMOVDQA64Z256", v64i8_info, v32i8x_info, v4i64x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z256", v8i64_info, v4i64x_info, v8i32x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z256", v16i32_info, v8i32x_info, v8i32x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z256", v32i16_info, v16i16x_info, v8i32x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z256", v64i8_info, v32i8x_info, v8i32x_info>; +defm : masked_move_for_extract<"VMOVAPDZ256", v8f64_info, v4f64x_info, v4f64x_info>; +defm : masked_move_for_extract<"VMOVAPDZ256", v16f32_info, v8f32x_info, v4f64x_info>; +defm : masked_move_for_extract<"VMOVAPSZ256", v8f64_info, v4f64x_info, v8f32x_info>; +defm : masked_move_for_extract<"VMOVAPSZ256", v16f32_info, v8f32x_info, v8f32x_info>; +} + +// Move Int Doubleword to Packed Double Int +// +let ExeDomain = SSEPackedInt in { +def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v4i32 (scalar_to_vector GR32:$src)))]>, + EVEX, Sched<[WriteVecMoveFromGpr]>; +def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, + EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecLoad]>; +def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v2i64 (scalar_to_vector GR64:$src)))]>, + EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", []>, + EVEX, VEX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteVecLoad]>; +let isCodeGenOnly = 1 in { +def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set FR64X:$dst, (bitconvert GR64:$src))]>, + EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>; +def VMOV64toSDZrm : AVX512XSI<0x7E, 
MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>, + EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteVecLoad]>; +def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64X:$src))]>, + EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>; +def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64X:$src)), addr:$dst)]>, + EVEX, VEX_W, Sched<[WriteVecStore]>, + EVEX_CD8<64, CD8VT1>; +} +} // ExeDomain = SSEPackedInt + +// Move Int Doubleword to Single Scalar +// +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { +def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set FR32X:$dst, (bitconvert GR32:$src))]>, + EVEX, Sched<[WriteVecMoveFromGpr]>; + +def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))]>, + EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecLoad]>; +} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 + +// Move doubleword from xmm register to r/m32 +// +let ExeDomain = SSEPackedInt in { +def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (extractelt (v4i32 VR128X:$src), + (iPTR 0)))]>, + EVEX, Sched<[WriteVecMoveToGpr]>; +def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, VR128X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(store (i32 (extractelt (v4i32 VR128X:$src), + (iPTR 0))), addr:$dst)]>, + EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>; +} // ExeDomain = SSEPackedInt + +// Move quadword from xmm1 register to r/m64 +// +let ExeDomain = SSEPackedInt in { +def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), + (iPTR 0)))]>, + PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>, + Requires<[HasAVX512]>; + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", []>, PD, + EVEX, VEX_W, Sched<[WriteVecStore]>, + Requires<[HasAVX512, In64BitMode]>; + +def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs), + (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), + addr:$dst)]>, + EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>, + Sched<[WriteVecStore]>, Requires<[HasAVX512]>; + +let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in +def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", []>, + EVEX, VEX_W, Sched<[SchedWriteVecLogic.XMM]>; +} // ExeDomain = SSEPackedInt + +def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}", + (VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>; + +// Move Scalar Single to Double Int +// +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { +def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), + (ins FR32X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32X:$src))]>, + EVEX, Sched<[WriteVecMoveToGpr]>; +def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, 
FR32X:$src), + "vmovd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32X:$src)), addr:$dst)]>, + EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>; +} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 + +// Move Quadword Int to Packed Quadword Int +// +let ExeDomain = SSEPackedInt in { +def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, + EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteVecLoad]>; +} // ExeDomain = SSEPackedInt + +// Allow "vmovd" but print "vmovq". +def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", + (VMOV64toPQIZrr VR128X:$dst, GR64:$src), 0>; +def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", + (VMOVPQIto64Zrr GR64:$dst, VR128X:$src), 0>; + +//===----------------------------------------------------------------------===// +// AVX-512 MOVSS, MOVSD +//===----------------------------------------------------------------------===// + +multiclass avx512_move_scalar<string asm, SDNode OpNode, + X86VectorVTInfo _> { + let Predicates = [HasAVX512, OptForSize] in + def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))], + _.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>; + def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|", + "$dst {${mask}} {z}, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask, + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), + _.ImmAllZerosV)))], + _.ExeDomain>, EVEX_4V, EVEX_KZ, Sched<[SchedWriteFShuffle.XMM]>; + let Constraints = "$src0 = $dst" in + def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|", + "$dst {${mask}}, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask, + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), + (_.VT _.RC:$src0))))], + _.ExeDomain>, EVEX_4V, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>; + let canFoldAsLoad = 1, isReMaterializable = 1 in + def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], + _.ExeDomain>, EVEX, Sched<[WriteFLoad]>; + let mayLoad = 1, hasSideEffects = 0 in { + let Constraints = "$src0 = $dst" in + def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|", + "$dst {${mask}}, $src}"), + [], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFLoad]>; + def rmkz : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst {${mask}} {z}|", + "$dst {${mask}} {z}, $src}"), + [], _.ExeDomain>, EVEX, EVEX_KZ, Sched<[WriteFLoad]>; + } + def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(store _.FRC:$src, addr:$dst)], _.ExeDomain>, + EVEX, Sched<[WriteFStore]>; + let mayStore = 1, hasSideEffects = 0 in + def mrk: AVX512PI<0x11, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], _.ExeDomain>, EVEX, EVEX_K, Sched<[WriteFStore]>, + NotMemoryFoldable; 
+} + +defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>, + VEX_LIG, XS, EVEX_CD8<32, CD8VT1>; + +defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>, + VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; + + +multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode, + PatLeaf ZeroFP, X86VectorVTInfo _> { + +def : Pat<(_.VT (OpNode _.RC:$src0, + (_.VT (scalar_to_vector + (_.EltVT (X86selects VK1WM:$mask, + (_.EltVT _.FRC:$src1), + (_.EltVT _.FRC:$src2))))))), + (!cast<Instruction>(InstrStr#rrk) + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, _.RC)), + VK1WM:$mask, + (_.VT _.RC:$src0), + (_.VT (COPY_TO_REGCLASS _.FRC:$src1, _.RC)))>; + +def : Pat<(_.VT (OpNode _.RC:$src0, + (_.VT (scalar_to_vector + (_.EltVT (X86selects VK1WM:$mask, + (_.EltVT _.FRC:$src1), + (_.EltVT ZeroFP))))))), + (!cast<Instruction>(InstrStr#rrkz) + VK1WM:$mask, + (_.VT _.RC:$src0), + (_.VT (COPY_TO_REGCLASS _.FRC:$src1, _.RC)))>; +} + +multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _, + dag Mask, RegisterClass MaskRC> { + +def : Pat<(masked_store addr:$dst, Mask, + (_.info512.VT (insert_subvector undef, + (_.info128.VT _.info128.RC:$src), + (iPTR 0)))), + (!cast<Instruction>(InstrStr#mrk) addr:$dst, + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), + (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; + +} + +multiclass avx512_store_scalar_lowering_subreg<string InstrStr, + AVX512VLVectorVTInfo _, + dag Mask, RegisterClass MaskRC, + SubRegIndex subreg> { + +def : Pat<(masked_store addr:$dst, Mask, + (_.info512.VT (insert_subvector undef, + (_.info128.VT _.info128.RC:$src), + (iPTR 0)))), + (!cast<Instruction>(InstrStr#mrk) addr:$dst, + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), + (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; + +} + +// This matches the more recent codegen from clang that avoids emitting a 512 +// bit masked store directly. Codegen will widen 128-bit masked store to 512 +// bits on AVX512F only targets. +multiclass avx512_store_scalar_lowering_subreg2<string InstrStr, + AVX512VLVectorVTInfo _, + dag Mask512, dag Mask128, + RegisterClass MaskRC, + SubRegIndex subreg> { + +// AVX512F pattern. +def : Pat<(masked_store addr:$dst, Mask512, + (_.info512.VT (insert_subvector undef, + (_.info128.VT _.info128.RC:$src), + (iPTR 0)))), + (!cast<Instruction>(InstrStr#mrk) addr:$dst, + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), + (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; + +// AVX512VL pattern. 
+def : Pat<(masked_store addr:$dst, Mask128, (_.info128.VT _.info128.RC:$src)), + (!cast<Instruction>(InstrStr#mrk) addr:$dst, + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), + (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>; +} + +multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _, + dag Mask, RegisterClass MaskRC> { + +def : Pat<(_.info128.VT (extract_subvector + (_.info512.VT (masked_load addr:$srcAddr, Mask, + (_.info512.VT (bitconvert + (v16i32 immAllZerosV))))), + (iPTR 0))), + (!cast<Instruction>(InstrStr#rmkz) + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), + addr:$srcAddr)>; + +def : Pat<(_.info128.VT (extract_subvector + (_.info512.VT (masked_load addr:$srcAddr, Mask, + (_.info512.VT (insert_subvector undef, + (_.info128.VT (X86vzmovl _.info128.RC:$src)), + (iPTR 0))))), + (iPTR 0))), + (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src, + (COPY_TO_REGCLASS MaskRC:$mask, VK1WM), + addr:$srcAddr)>; + +} + +multiclass avx512_load_scalar_lowering_subreg<string InstrStr, + AVX512VLVectorVTInfo _, + dag Mask, RegisterClass MaskRC, + SubRegIndex subreg> { + +def : Pat<(_.info128.VT (extract_subvector + (_.info512.VT (masked_load addr:$srcAddr, Mask, + (_.info512.VT (bitconvert + (v16i32 immAllZerosV))))), + (iPTR 0))), + (!cast<Instruction>(InstrStr#rmkz) + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), + addr:$srcAddr)>; + +def : Pat<(_.info128.VT (extract_subvector + (_.info512.VT (masked_load addr:$srcAddr, Mask, + (_.info512.VT (insert_subvector undef, + (_.info128.VT (X86vzmovl _.info128.RC:$src)), + (iPTR 0))))), + (iPTR 0))), + (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src, + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), + addr:$srcAddr)>; + +} + +// This matches the more recent codegen from clang that avoids emitting a 512 +// bit masked load directly. Codegen will widen 128-bit masked load to 512 +// bits on AVX512F only targets. +multiclass avx512_load_scalar_lowering_subreg2<string InstrStr, + AVX512VLVectorVTInfo _, + dag Mask512, dag Mask128, + RegisterClass MaskRC, + SubRegIndex subreg> { +// AVX512F patterns. +def : Pat<(_.info128.VT (extract_subvector + (_.info512.VT (masked_load addr:$srcAddr, Mask512, + (_.info512.VT (bitconvert + (v16i32 immAllZerosV))))), + (iPTR 0))), + (!cast<Instruction>(InstrStr#rmkz) + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), + addr:$srcAddr)>; + +def : Pat<(_.info128.VT (extract_subvector + (_.info512.VT (masked_load addr:$srcAddr, Mask512, + (_.info512.VT (insert_subvector undef, + (_.info128.VT (X86vzmovl _.info128.RC:$src)), + (iPTR 0))))), + (iPTR 0))), + (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src, + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), + addr:$srcAddr)>; + +// AVX512Vl patterns. 
+def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128, + (_.info128.VT (bitconvert (v4i32 immAllZerosV))))), + (!cast<Instruction>(InstrStr#rmkz) + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), + addr:$srcAddr)>; + +def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128, + (_.info128.VT (X86vzmovl _.info128.RC:$src)))), + (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src, + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM), + addr:$srcAddr)>; +} + +defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>; +defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>; + +defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, + (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; +defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, + (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>; +defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, + (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; + +defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info, + (v16i1 (insert_subvector + (v16i1 immAllZerosV), + (v4i1 (extract_subvector + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + (iPTR 0))), + (iPTR 0))), + (v4i1 (extract_subvector + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + (iPTR 0))), GR8, sub_8bit>; +defm : avx512_store_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info, + (v8i1 + (extract_subvector + (v16i1 + (insert_subvector + (v16i1 immAllZerosV), + (v2i1 (extract_subvector + (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), + (iPTR 0))), + (iPTR 0))), + (iPTR 0))), + (v2i1 (extract_subvector + (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), + (iPTR 0))), GR8, sub_8bit>; + +defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, + (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; +defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, + (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>; +defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, + (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; + +defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info, + (v16i1 (insert_subvector + (v16i1 immAllZerosV), + (v4i1 (extract_subvector + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + (iPTR 0))), + (iPTR 0))), + (v4i1 (extract_subvector + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + (iPTR 0))), GR8, sub_8bit>; +defm : avx512_load_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info, + (v8i1 + (extract_subvector + (v16i1 + (insert_subvector + (v16i1 immAllZerosV), + (v2i1 (extract_subvector + (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), + (iPTR 0))), + (iPTR 0))), + (iPTR 0))), + (v2i1 (extract_subvector + (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), + (iPTR 0))), GR8, sub_8bit>; + +def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), + (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk + (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)), + VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), + (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>; + +def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), fp32imm0)), + (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrkz VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), + (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)))), FR32X)>; + +def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 
FR64X:$src2))), + (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrk + (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)), + VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), + (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>; + +def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), fpimm0)), + (COPY_TO_REGCLASS (v2f64 (VMOVSDZrrkz VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), + (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)))), FR64X)>; + +let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + "vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrr">, + Sched<[SchedWriteFShuffle.XMM]>; + + let Constraints = "$src0 = $dst" in + def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, + VR128X:$src1, VR128X:$src2), + "vmovss\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + []>, EVEX_K, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrrk">, + Sched<[SchedWriteFShuffle.XMM]>; + + def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2), + "vmovss\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + []>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrrkz">, + Sched<[SchedWriteFShuffle.XMM]>; + + def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + "vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, XD, EVEX_4V, VEX_LIG, VEX_W, + FoldGenData<"VMOVSDZrr">, + Sched<[SchedWriteFShuffle.XMM]>; + + let Constraints = "$src0 = $dst" in + def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, + VR128X:$src1, VR128X:$src2), + "vmovsd\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + []>, EVEX_K, XD, EVEX_4V, VEX_LIG, + VEX_W, FoldGenData<"VMOVSDZrrk">, + Sched<[SchedWriteFShuffle.XMM]>; + + def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.KRCWM:$mask, VR128X:$src1, + VR128X:$src2), + "vmovsd\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + []>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, + VEX_W, FoldGenData<"VMOVSDZrrkz">, + Sched<[SchedWriteFShuffle.XMM]>; +} + +def : InstAlias<"vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (VMOVSSZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>; +def : InstAlias<"vmovss.s\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + (VMOVSSZrrk_REV VR128X:$dst, VK1WM:$mask, + VR128X:$src1, VR128X:$src2), 0>; +def : InstAlias<"vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + (VMOVSSZrrkz_REV VR128X:$dst, VK1WM:$mask, + VR128X:$src1, VR128X:$src2), 0>; +def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (VMOVSDZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>; +def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + (VMOVSDZrrk_REV VR128X:$dst, VK1WM:$mask, + VR128X:$src1, VR128X:$src2), 0>; +def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + (VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask, + VR128X:$src1, VR128X:$src2), 0>; + +let Predicates = [HasAVX512, OptForSize] in { + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))), + (VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>; + def : Pat<(v4i32 
(X86vzmovl (v4i32 VR128X:$src))), + (VMOVSSZrr (v4i32 (AVX512_128_SET0)), VR128X:$src)>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)), + (v4f32 (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)))), sub_xmm)>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)), + (v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>; + + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)), + (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)), + (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>; + + def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)), + (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)))), sub_xmm)>; + def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)), + (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>; + + def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)), + (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>; + + def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)), + (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>; + +} + +// Use 128-bit blends for OptForSpeed since BLENDs have better throughput than +// VMOVSS/SD. Unfortunately, loses the ability to use XMM16-31. +let Predicates = [HasAVX512, OptForSpeed] in { + def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VBLENDPSrri (v4f32 (V_SET0)), + (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm)), + (i8 1))), sub_xmm)>; + def : Pat<(v16i32 (X86vzmovl (v16i32 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (v4i32 (VPBLENDWrri (v4i32 (V_SET0)), + (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)), + (i8 3))), sub_xmm)>; + + def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (v2f64 (VBLENDPDrri (v2f64 (V_SET0)), + (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), + (i8 1))), sub_xmm)>; + def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (v2i64 (VPBLENDWrri (v2i64 (V_SET0)), + (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), + (i8 0xf))), sub_xmm)>; +} + +let Predicates = [HasAVX512] in { + + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + def : Pat<(v4f32 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>; + + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. 
The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>; + + // Represent the same patterns above but in the form they appear for + // 256-bit types + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>; + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; + def : Pat<(v8f32 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; + def : Pat<(v4f64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; + + // Represent the same patterns above but in the form they appear for + // 512-bit types + def : Pat<(v16i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>; + def : Pat<(v16f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; + def : Pat<(v16f32 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>; + def : Pat<(v8f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; + def : Pat<(v8f64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; + + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>; + + // Extract and store. + def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), + addr:$dst), + (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>; +} + +let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in { +def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, (v2i64 (X86vzmovl + (v2i64 VR128X:$src))))]>, + EVEX, VEX_W; +} + +let Predicates = [HasAVX512] in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), + (VMOVDI2PDIZrr GR32:$src)>; + + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (VMOV64toPQIZrr GR64:$src)>; + + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>; + + def : Pat<(v8i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIZrr GR64:$src)), sub_xmm)>; + + // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part. 
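Because vmovd/vmovq into an xmm register zero everything above the copied element, an explicit zero-extend in the IR (X86vzmovl of a scalar_to_vector) costs no extra instruction, and the patterns below simply select the plain move. A small C sketch of code that produces this DAG shape; the function names are mine, and whether the VEX or EVEX encoding is emitted depends on the enabled target features:

#include <immintrin.h>

/* Illustrative sketch only: load one scalar into the low lane with the
   upper lanes zeroed -- the (X86vzmovl (scalar_to_vector ...)) shape. */
__m128i load_low_dword(const int *p) {
    return _mm_cvtsi32_si128(*p);              /* -> vmovd, upper lanes zero */
}

__m128i load_low_qword(const long long *p) {   /* x86-64 only */
    return _mm_cvtsi64_si128(*p);              /* -> vmovq, upper lanes zero */
}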
+ def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v4i32 (X86vzload addr:$src)), + (VMOVDI2PDIZrm addr:$src)>; + def : Pat<(v8i32 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>; + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (VMOVQI2PQIZrm addr:$src)>; + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))), + (VMOVZPQILo2PQIZrr VR128X:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), + (VMOVQI2PQIZrm addr:$src)>; + def : Pat<(v4i64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>; + + // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>; + def : Pat<(v16i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrr GR32:$src)), sub_xmm)>; + + // Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext. + def : Pat<(v16i32 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>; + def : Pat<(v8i64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 - Non-temporals +//===----------------------------------------------------------------------===// + +def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst), + (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", + [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.ZMM.RM]>, + EVEX, T8PD, EVEX_V512, EVEX_CD8<64, CD8VF>; + +let Predicates = [HasVLX] in { + def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst), + (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.YMM.RM]>, + EVEX, T8PD, EVEX_V256, EVEX_CD8<64, CD8VF>; + + def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst), + (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", + [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.XMM.RM]>, + EVEX, T8PD, EVEX_V128, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86SchedWriteMoveLS Sched, + PatFrag st_frag = alignednontemporalstore> { + let SchedRW = [Sched.MR], AddedComplexity = 400 in + def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(st_frag (_.VT _.RC:$src), addr:$dst)], + _.ExeDomain>, EVEX, EVEX_CD8<_.EltSize, CD8VF>; +} + +multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + X86SchedWriteMoveLSWidths Sched> { + let Predicates = [HasAVX512] in + defm Z : avx512_movnt<opc, OpcodeStr, VTInfo.info512, Sched.ZMM>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_movnt<opc, OpcodeStr, VTInfo.info256, Sched.YMM>, EVEX_V256; + defm Z128 : avx512_movnt<opc, OpcodeStr, VTInfo.info128, Sched.XMM>, EVEX_V128; + } +} + +defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info, + SchedWriteVecMoveLSNT>, PD; +defm 
VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info, + SchedWriteFMoveLSNT>, PD, VEX_W; +defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info, + SchedWriteFMoveLSNT>, PS; + +let Predicates = [HasAVX512], AddedComplexity = 400 in { + def : Pat<(alignednontemporalstore (v16i32 VR512:$src), addr:$dst), + (VMOVNTDQZmr addr:$dst, VR512:$src)>; + def : Pat<(alignednontemporalstore (v32i16 VR512:$src), addr:$dst), + (VMOVNTDQZmr addr:$dst, VR512:$src)>; + def : Pat<(alignednontemporalstore (v64i8 VR512:$src), addr:$dst), + (VMOVNTDQZmr addr:$dst, VR512:$src)>; + + def : Pat<(v8f64 (alignednontemporalload addr:$src)), + (VMOVNTDQAZrm addr:$src)>; + def : Pat<(v16f32 (alignednontemporalload addr:$src)), + (VMOVNTDQAZrm addr:$src)>; + def : Pat<(v8i64 (alignednontemporalload addr:$src)), + (VMOVNTDQAZrm addr:$src)>; +} + +let Predicates = [HasVLX], AddedComplexity = 400 in { + def : Pat<(alignednontemporalstore (v8i32 VR256X:$src), addr:$dst), + (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>; + def : Pat<(alignednontemporalstore (v16i16 VR256X:$src), addr:$dst), + (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>; + def : Pat<(alignednontemporalstore (v32i8 VR256X:$src), addr:$dst), + (VMOVNTDQZ256mr addr:$dst, VR256X:$src)>; + + def : Pat<(v4f64 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ256rm addr:$src)>; + def : Pat<(v8f32 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ256rm addr:$src)>; + def : Pat<(v4i64 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ256rm addr:$src)>; + + def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst), + (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>; + def : Pat<(alignednontemporalstore (v8i16 VR128X:$src), addr:$dst), + (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>; + def : Pat<(alignednontemporalstore (v16i8 VR128X:$src), addr:$dst), + (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>; + + def : Pat<(v2f64 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ128rm addr:$src)>; + def : Pat<(v4f32 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ128rm addr:$src)>; + def : Pat<(v2i64 (alignednontemporalload addr:$src)), + (VMOVNTDQAZ128rm addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 - Integer arithmetic +// +multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, X86FoldableSchedWrite sched, + bit IsCommutable = 0> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), + IsCommutable>, AVX512BIBase, EVEX_4V, + Sched<[sched]>; + + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (bitconvert (_.LdFrag addr:$src2))))>, + AVX512BIBase, EVEX_4V, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, X86FoldableSchedWrite sched, + bit IsCommutable = 0> : + avx512_binop_rm<opc, OpcodeStr, OpNode, _, sched, IsCommutable> { + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, + (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))>, + AVX512BIBase, EVEX_4V, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, 
SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, + X86SchedWriteWidths sched, Predicate prd, + bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info512, sched.ZMM, + IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info256, + sched.YMM, IsCommutable>, EVEX_V256; + defm Z128 : avx512_binop_rm<opc, OpcodeStr, OpNode, VTInfo.info128, + sched.XMM, IsCommutable>, EVEX_V128; + } +} + +multiclass avx512_binop_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, + X86SchedWriteWidths sched, Predicate prd, + bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, sched.ZMM, + IsCommutable>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, + sched.YMM, IsCommutable>, EVEX_V256; + defm Z128 : avx512_binop_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, + sched.XMM, IsCommutable>, EVEX_V128; + } +} + +multiclass avx512_binop_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info, + sched, prd, IsCommutable>, + VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_binop_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info, + sched, prd, IsCommutable>, EVEX_CD8<32, CD8VF>; +} + +multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info, + sched, prd, IsCommutable>, EVEX_CD8<16, CD8VF>, + VEX_WIG; +} + +multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info, + sched, prd, IsCommutable>, EVEX_CD8<8, CD8VF>, + VEX_WIG; +} + +multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, + SDNode OpNode, X86SchedWriteWidths sched, + Predicate prd, bit IsCommutable = 0> { + defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, sched, prd, + IsCommutable>; + + defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, sched, prd, + IsCommutable>; +} + +multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, + SDNode OpNode, X86SchedWriteWidths sched, + Predicate prd, bit IsCommutable = 0> { + defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, sched, prd, + IsCommutable>; + + defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, sched, prd, + IsCommutable>; +} + +multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w, + bits<8> opc_d, bits<8> opc_q, + string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, + sched, HasAVX512, IsCommutable>, + avx512_binop_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, + sched, HasBWI, IsCommutable>; +} + +multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, + SDNode OpNode,X86VectorVTInfo _Src, + X86VectorVTInfo _Dst, X86VectorVTInfo _Brdct, + bit IsCommutable = 0> { + 
defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2))), + IsCommutable>, + AVX512BIBase, EVEX_4V, Sched<[sched]>; + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), + (bitconvert (_Src.LdFrag addr:$src2))))>, + AVX512BIBase, EVEX_4V, + Sched<[sched.Folded, ReadAfterLd]>; + + defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2), + OpcodeStr, + "${src2}"##_Brdct.BroadcastStr##", $src1", + "$src1, ${src2}"##_Brdct.BroadcastStr, + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Brdct.VT (X86VBroadcast + (_Brdct.ScalarLdFrag addr:$src2))))))>, + AVX512BIBase, EVEX_4V, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; +} + +defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add, + SchedWriteVecALU, 1>; +defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub, + SchedWriteVecALU, 0>; +defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds, + SchedWriteVecALU, HasBWI, 1>; +defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs, + SchedWriteVecALU, HasBWI, 0>; +defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus, + SchedWriteVecALU, HasBWI, 1>; +defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus, + SchedWriteVecALU, HasBWI, 0>; +defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, + SchedWritePMULLD, HasAVX512, 1>, T8PD; +defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul, + SchedWriteVecIMul, HasBWI, 1>; +defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, + SchedWriteVecIMul, HasDQI, 1>, T8PD, + NotEVEX2VEXConvertible; +defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SchedWriteVecIMul, + HasBWI, 1>; +defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul, + HasBWI, 1>; +defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, + SchedWriteVecIMul, HasBWI, 1>, T8PD; +defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, + SchedWriteVecALU, HasBWI, 1>; +defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq, + SchedWriteVecIMul, HasAVX512, 1>, T8PD; +defm VPMULUDQ : avx512_binop_rm_vl_q<0xF4, "vpmuludq", X86pmuludq, + SchedWriteVecIMul, HasAVX512, 1>; + +multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, + X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _SrcVTInfo, + AVX512VLVectorVTInfo _DstVTInfo, + SDNode OpNode, Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm NAME#Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode, + _SrcVTInfo.info512, _DstVTInfo.info512, + v8i64_info, IsCommutable>, + EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_W; + let Predicates = [HasVLX, prd] in { + defm NAME#Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode, + _SrcVTInfo.info256, _DstVTInfo.info256, + v4i64x_info, IsCommutable>, + EVEX_V256, EVEX_CD8<64, CD8VF>, VEX_W; + defm NAME#Z128 : avx512_binop_rm2<opc, OpcodeStr, sched.XMM, OpNode, + _SrcVTInfo.info128, _DstVTInfo.info128, + v2i64x_info, IsCommutable>, + EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W; + } +} + +defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SchedWriteVecALU, + avx512vl_i8_info, avx512vl_i8_info, + X86multishift, HasVBMI, 0>, T8PD; + +multiclass 
avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _Src, X86VectorVTInfo _Dst, + X86FoldableSchedWrite sched> { + defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), + OpcodeStr, + "${src2}"##_Src.BroadcastStr##", $src1", + "$src1, ${src2}"##_Src.BroadcastStr, + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Src.VT (X86VBroadcast + (_Src.ScalarLdFrag addr:$src2))))))>, + EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode,X86VectorVTInfo _Src, + X86VectorVTInfo _Dst, X86FoldableSchedWrite sched, + bit IsCommutable = 0> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2))), + IsCommutable>, + EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[sched]>; + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), + (bitconvert (_Src.LdFrag addr:$src2))))>, + EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let Predicates = [HasBWI] in + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info, + v32i16_info, SchedWriteShuffle.ZMM>, + avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info, + v32i16_info, SchedWriteShuffle.ZMM>, EVEX_V512; + let Predicates = [HasBWI, HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info, + v16i16x_info, SchedWriteShuffle.YMM>, + avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info, + v16i16x_info, SchedWriteShuffle.YMM>, + EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v4i32x_info, + v8i16x_info, SchedWriteShuffle.XMM>, + avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info, + v8i16x_info, SchedWriteShuffle.XMM>, + EVEX_V128; + } +} +multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr, + SDNode OpNode> { + let Predicates = [HasBWI] in + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info, v64i8_info, + SchedWriteShuffle.ZMM>, EVEX_V512, VEX_WIG; + let Predicates = [HasBWI, HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info, + v32i8x_info, SchedWriteShuffle.YMM>, + EVEX_V256, VEX_WIG; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info, + v16i8x_info, SchedWriteShuffle.XMM>, + EVEX_V128, VEX_WIG; + } +} + +multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo _Src, + AVX512VLVectorVTInfo _Dst, bit IsCommutable = 0> { + let Predicates = [HasBWI] in + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512, + _Dst.info512, SchedWriteVecIMul.ZMM, + IsCommutable>, EVEX_V512; + let Predicates = [HasBWI, HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256, + _Dst.info256, SchedWriteVecIMul.YMM, + IsCommutable>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128, + _Dst.info128, SchedWriteVecIMul.XMM, + IsCommutable>, EVEX_V128; + } +} + +defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, AVX512BIBase; +defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, 
"vpackusdw", X86Packus>, AVX5128IBase; +defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase; +defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase; + +defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw, + avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD, VEX_WIG; +defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd, + avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase, VEX_WIG; + +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax, + SchedWriteVecALU, HasBWI, 1>, T8PD; +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax, + SchedWriteVecALU, HasBWI, 1>; +defm VPMAXSD : avx512_binop_rm_vl_d<0x3D, "vpmaxsd", smax, + SchedWriteVecALU, HasAVX512, 1>, T8PD; +defm VPMAXSQ : avx512_binop_rm_vl_q<0x3D, "vpmaxsq", smax, + SchedWriteVecALU, HasAVX512, 1>, T8PD, + NotEVEX2VEXConvertible; + +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax, + SchedWriteVecALU, HasBWI, 1>; +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax, + SchedWriteVecALU, HasBWI, 1>, T8PD; +defm VPMAXUD : avx512_binop_rm_vl_d<0x3F, "vpmaxud", umax, + SchedWriteVecALU, HasAVX512, 1>, T8PD; +defm VPMAXUQ : avx512_binop_rm_vl_q<0x3F, "vpmaxuq", umax, + SchedWriteVecALU, HasAVX512, 1>, T8PD, + NotEVEX2VEXConvertible; + +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin, + SchedWriteVecALU, HasBWI, 1>, T8PD; +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin, + SchedWriteVecALU, HasBWI, 1>; +defm VPMINSD : avx512_binop_rm_vl_d<0x39, "vpminsd", smin, + SchedWriteVecALU, HasAVX512, 1>, T8PD; +defm VPMINSQ : avx512_binop_rm_vl_q<0x39, "vpminsq", smin, + SchedWriteVecALU, HasAVX512, 1>, T8PD, + NotEVEX2VEXConvertible; + +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin, + SchedWriteVecALU, HasBWI, 1>; +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin, + SchedWriteVecALU, HasBWI, 1>, T8PD; +defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin, + SchedWriteVecALU, HasAVX512, 1>, T8PD; +defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin, + SchedWriteVecALU, HasAVX512, 1>, T8PD, + NotEVEX2VEXConvertible; + +// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. +let Predicates = [HasDQI, NoVLX] in { + def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG + (VPMULLQZrr + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + sub_ymm)>; + + def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG + (VPMULLQZrr + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), + sub_xmm)>; +} + +// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. 
+multiclass avx512_min_max_lowering<Instruction Instr, SDNode OpNode> {
+  def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)),
+            (EXTRACT_SUBREG
+                (Instr
+                    (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+                    (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+             sub_ymm)>;
+
+  def : Pat<(v2i64 (OpNode VR128X:$src1, VR128X:$src2)),
+            (EXTRACT_SUBREG
+                (Instr
+                    (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+                    (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+             sub_xmm)>;
+}
+
+let Predicates = [HasAVX512, NoVLX] in {
+  defm : avx512_min_max_lowering<VPMAXUQZrr, umax>;
+  defm : avx512_min_max_lowering<VPMINUQZrr, umin>;
+  defm : avx512_min_max_lowering<VPMAXSQZrr, smax>;
+  defm : avx512_min_max_lowering<VPMINSQZrr, smin>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512 Logical Instructions
+//===----------------------------------------------------------------------===//
+
+// OpNodeMsk is the OpNode to use when element size is important. OpNode will
+// be set to null_frag for 32-bit elements.
+multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
+                           SDPatternOperator OpNode,
+                           SDNode OpNodeMsk, X86FoldableSchedWrite sched,
+                           X86VectorVTInfo _, bit IsCommutable = 0> {
+  let hasSideEffects = 0 in
+  defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                    (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                    "$src2, $src1", "$src1, $src2",
+                    (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
+                                     (bitconvert (_.VT _.RC:$src2)))),
+                    (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
+                                                          _.RC:$src2)))),
+                    IsCommutable>, AVX512BIBase, EVEX_4V,
+                    Sched<[sched]>;
+
+  let hasSideEffects = 0, mayLoad = 1 in
+  defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                  (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
+                  "$src2, $src1", "$src1, $src2",
+                  (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
+                                   (bitconvert (_.LdFrag addr:$src2)))),
+                  (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
+                                     (bitconvert (_.LdFrag addr:$src2))))))>,
+                  AVX512BIBase, EVEX_4V,
+                  Sched<[sched.Folded, ReadAfterLd]>;
+}
+
+// OpNodeMsk is the OpNode to use where element size is important. So use
+// for all of the broadcast patterns.
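+// For example, in the VPAND/VPOR/VPXOR/VPANDN instantiations further down,
+// the Q forms pass the real node for both OpNode and OpNodeMsk, while the D
+// forms pass null_frag as OpNode: their plain (unmasked) pattern is written
+// in terms of _.i64VT and would only duplicate the Q pattern, so the D forms
+// contribute just the masked and broadcast patterns, where the 32-bit
+// element size actually matters, through OpNodeMsk.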
+multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode, + SDNode OpNodeMsk, X86FoldableSchedWrite sched, X86VectorVTInfo _, + bit IsCommutable = 0> : + avx512_logic_rm<opc, OpcodeStr, OpNode, OpNodeMsk, sched, _, + IsCommutable> { + defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.i64VT (OpNodeMsk _.RC:$src1, + (bitconvert + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))), + (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1, + (bitconvert + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))))))>, + AVX512BIBase, EVEX_4V, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode, + SDNode OpNodeMsk, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo VTInfo, + bit IsCommutable = 0> { + let Predicates = [HasAVX512] in + defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.ZMM, + VTInfo.info512, IsCommutable>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.YMM, + VTInfo.info256, IsCommutable>, EVEX_V256; + defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, sched.XMM, + VTInfo.info128, IsCommutable>, EVEX_V128; + } +} + +multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, + SDNode OpNode, X86SchedWriteWidths sched, + bit IsCommutable = 0> { + defm Q : avx512_logic_rmb_vl<opc_q, OpcodeStr#"q", OpNode, OpNode, sched, + avx512vl_i64_info, IsCommutable>, + VEX_W, EVEX_CD8<64, CD8VF>; + defm D : avx512_logic_rmb_vl<opc_d, OpcodeStr#"d", null_frag, OpNode, sched, + avx512vl_i32_info, IsCommutable>, + EVEX_CD8<32, CD8VF>; +} + +defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, + SchedWriteVecLogic, 1>; +defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, + SchedWriteVecLogic, 1>; +defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, + SchedWriteVecLogic, 1>; +defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, + SchedWriteVecLogic>; + +//===----------------------------------------------------------------------===// +// AVX-512 FP arithmetic +//===----------------------------------------------------------------------===// + +multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode, SDNode VecNode, + X86FoldableSchedWrite sched, bit IsCommutable> { + let ExeDomain = _.ExeDomain in { + defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (VecNode _.RC:$src1, _.RC:$src2, + (i32 FROUND_CURRENT)))>, + Sched<[sched]>; + + defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (VecNode _.RC:$src1, + _.ScalarIntMemCPat:$src2, + (i32 FROUND_CURRENT)))>, + Sched<[sched.Folded, ReadAfterLd]>; + let isCodeGenOnly = 1, Predicates = [HasAVX512] in { + def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, + Sched<[sched]> { + let isCommutable = IsCommutable; + } + def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, 
$src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2)))]>, + Sched<[sched.Folded, ReadAfterLd]>; + } + } +} + +multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode VecNode, X86FoldableSchedWrite sched, + bit IsCommutable = 0> { + let ExeDomain = _.ExeDomain in + defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$rc)), IsCommutable>, + EVEX_B, EVEX_RC, Sched<[sched]>; +} +multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode, SDNode VecNode, SDNode SaeNode, + X86FoldableSchedWrite sched, bit IsCommutable> { + let ExeDomain = _.ExeDomain in { + defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (VecNode _.RC:$src1, _.RC:$src2))>, + Sched<[sched]>; + + defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (VecNode _.RC:$src1, + _.ScalarIntMemCPat:$src2))>, + Sched<[sched.Folded, ReadAfterLd]>; + + let isCodeGenOnly = 1, Predicates = [HasAVX512] in { + def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, + Sched<[sched]> { + let isCommutable = IsCommutable; + } + def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2)))]>, + Sched<[sched.Folded, ReadAfterLd]>; + } + + defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B, + Sched<[sched]>; + } +} + +multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode VecNode, X86SchedWriteSizes sched, + bit IsCommutable> { + defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, + sched.PS.Scl, IsCommutable>, + avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, VecNode, + sched.PS.Scl, IsCommutable>, + XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, + sched.PD.Scl, IsCommutable>, + avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, VecNode, + sched.PD.Scl, IsCommutable>, + XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; +} + +multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode VecNode, SDNode SaeNode, + X86SchedWriteSizes sched, bit IsCommutable> { + defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode, + VecNode, SaeNode, sched.PS.Scl, IsCommutable>, + XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode, + VecNode, SaeNode, sched.PD.Scl, IsCommutable>, + XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; +} +defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds, + SchedWriteFAddSizes, 1>; +defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds, + 
SchedWriteFMulSizes, 1>; +defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds, + SchedWriteFAddSizes, 0>; +defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds, + SchedWriteFDivSizes, 0>; +defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminRnds, + SchedWriteFCmpSizes, 0>; +defm VMAX : avx512_binop_s_sae<0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds, + SchedWriteFCmpSizes, 0>; + +// MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use +// X86fminc and X86fmaxc instead of X86fmin and X86fmax +multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, SDNode OpNode, + X86FoldableSchedWrite sched> { + let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { + def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))]>, + Sched<[sched]> { + let isCommutable = 1; + } + def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set _.FRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2)))]>, + Sched<[sched.Folded, ReadAfterLd]>; + } +} +defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc, + SchedWriteFCmp.Scl>, XS, EVEX_4V, + VEX_LIG, EVEX_CD8<32, CD8VT1>; + +defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc, + SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V, + VEX_LIG, EVEX_CD8<64, CD8VT1>; + +defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc, + SchedWriteFCmp.Scl>, XS, EVEX_4V, + VEX_LIG, EVEX_CD8<32, CD8VT1>; + +defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, + SchedWriteFCmp.Scl>, XD, VEX_W, EVEX_4V, + VEX_LIG, EVEX_CD8<64, CD8VT1>; + +multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, + X86VectorVTInfo _, X86FoldableSchedWrite sched, + bit IsCommutable, + bit IsKZCommutable = IsCommutable> { + let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable, 0, + IsKZCommutable>, + EVEX_4V, Sched<[sched]>; + let mayLoad = 1 in { + defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, + EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>; + defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))>, + EVEX_4V, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; + } + } +} + +multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNodeRnd, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in + defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>, + EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>; +} + +multiclass avx512_fp_sae_packed<bits<8> opc, string 
OpcodeStr, + SDPatternOperator OpNodeRnd, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in + defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>, + EVEX_4V, EVEX_B, Sched<[sched]>; +} + +multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, + Predicate prd, X86SchedWriteSizes sched, + bit IsCommutable = 0, + bit IsPD128Commutable = IsCommutable> { + let Predicates = [prd] in { + defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, + sched.PS.ZMM, IsCommutable>, EVEX_V512, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info, + sched.PD.ZMM, IsCommutable>, EVEX_V512, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + } + + // Define only if AVX512VL feature is present. + let Predicates = [prd, HasVLX] in { + defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info, + sched.PS.XMM, IsCommutable>, EVEX_V128, PS, + EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info, + sched.PS.YMM, IsCommutable>, EVEX_V256, PS, + EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info, + sched.PD.XMM, IsPD128Commutable, + IsCommutable>, EVEX_V128, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info, + sched.PD.YMM, IsCommutable>, EVEX_V256, PD, VEX_W, + EVEX_CD8<64, CD8VF>; + } +} + +multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, + X86SchedWriteSizes sched> { + defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM, + v16f32_info>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM, + v8f64_info>, + EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, + X86SchedWriteSizes sched> { + defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM, + v16f32_info>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM, + v8f64_info>, + EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; +} + +defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512, + SchedWriteFAddSizes, 1>, + avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SchedWriteFAddSizes>; +defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512, + SchedWriteFMulSizes, 1>, + avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SchedWriteFMulSizes>; +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512, + SchedWriteFAddSizes>, + avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SchedWriteFAddSizes>; +defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512, + SchedWriteFDivSizes>, + avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>; +defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512, + SchedWriteFCmpSizes, 0>, + avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd, SchedWriteFCmpSizes>; +defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512, + SchedWriteFCmpSizes, 0>, + avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd, SchedWriteFCmpSizes>; +let isCodeGenOnly = 1 in { + defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512, + SchedWriteFCmpSizes, 1>; + defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512, + SchedWriteFCmpSizes, 
1>; +} +defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI, + SchedWriteFLogicSizes, 1>; +defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI, + SchedWriteFLogicSizes, 0>; +defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI, + SchedWriteFLogicSizes, 1>; +defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI, + SchedWriteFLogicSizes, 1>; + +// Patterns catch floating point selects with bitcasted integer logic ops. +multiclass avx512_fp_logical_lowering<string InstrStr, SDNode OpNode, + X86VectorVTInfo _, Predicate prd> { +let Predicates = [prd] in { + // Masked register-register logical operations. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))), + _.RC:$src0)), + (!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask, + _.RC:$src1, _.RC:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert (_.i64VT (OpNode _.RC:$src1, _.RC:$src2))), + _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1, + _.RC:$src2)>; + // Masked register-memory logical operations. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert (_.i64VT (OpNode _.RC:$src1, + (load addr:$src2)))), + _.RC:$src0)), + (!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask, + _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert (_.i64VT (OpNode _.RC:$src1, (load addr:$src2)))), + _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#rmkz) _.KRCWM:$mask, _.RC:$src1, + addr:$src2)>; + // Register-broadcast logical operations. + def : Pat<(_.i64VT (OpNode _.RC:$src1, + (bitconvert (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))), + (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert + (_.i64VT (OpNode _.RC:$src1, + (bitconvert (_.VT + (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))))), + _.RC:$src0)), + (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, + _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (bitconvert + (_.i64VT (OpNode _.RC:$src1, + (bitconvert (_.VT + (X86VBroadcast + (_.ScalarLdFrag addr:$src2))))))), + _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#rmbkz) _.KRCWM:$mask, + _.RC:$src1, addr:$src2)>; +} +} + +multiclass avx512_fp_logical_lowering_sizes<string InstrStr, SDNode OpNode> { + defm : avx512_fp_logical_lowering<InstrStr#DZ128, OpNode, v4f32x_info, HasVLX>; + defm : avx512_fp_logical_lowering<InstrStr#QZ128, OpNode, v2f64x_info, HasVLX>; + defm : avx512_fp_logical_lowering<InstrStr#DZ256, OpNode, v8f32x_info, HasVLX>; + defm : avx512_fp_logical_lowering<InstrStr#QZ256, OpNode, v4f64x_info, HasVLX>; + defm : avx512_fp_logical_lowering<InstrStr#DZ, OpNode, v16f32_info, HasAVX512>; + defm : avx512_fp_logical_lowering<InstrStr#QZ, OpNode, v8f64_info, HasAVX512>; +} + +defm : avx512_fp_logical_lowering_sizes<"VPAND", and>; +defm : avx512_fp_logical_lowering_sizes<"VPOR", or>; +defm : avx512_fp_logical_lowering_sizes<"VPXOR", xor>; +defm : avx512_fp_logical_lowering_sizes<"VPANDN", X86andnp>; + +let Predicates = [HasVLX,HasDQI] in { + // Use packed logical operations for scalar ops. 
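+  // Each scalar FP logic node (X86fand/X86for/X86fxor/X86fandn, as typically
+  // produced when lowering fabs/fneg/copysign) has no scalar instruction, so
+  // the FR32X/FR64X operand is viewed as the low lane of a VR128X register
+  // via COPY_TO_REGCLASS, the packed 128-bit EVEX logic instruction is used,
+  // and the result is copied back; the upper lanes are don't-care.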
+ def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)), + (COPY_TO_REGCLASS + (v2f64 (VANDPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)), + (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))), + FR64X)>; + def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)), + (COPY_TO_REGCLASS + (v2f64 (VORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)), + (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))), + FR64X)>; + def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)), + (COPY_TO_REGCLASS + (v2f64 (VXORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)), + (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))), + FR64X)>; + def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)), + (COPY_TO_REGCLASS + (v2f64 (VANDNPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)), + (v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))), + FR64X)>; + + def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)), + (COPY_TO_REGCLASS + (v4f32 (VANDPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)), + (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))), + FR32X)>; + def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)), + (COPY_TO_REGCLASS + (v4f32 (VORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)), + (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))), + FR32X)>; + def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)), + (COPY_TO_REGCLASS + (v4f32 (VXORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)), + (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))), + FR32X)>; + def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)), + (COPY_TO_REGCLASS + (v4f32 (VANDNPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)), + (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))), + FR32X)>; +} + +multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, + EVEX_4V, Sched<[sched]>; + defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, + EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>; + defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT))>, + EVEX_4V, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, + Sched<[sched]>; + defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2, + (i32 FROUND_CURRENT))>, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, + SDNode OpNode, SDNode OpNodeScal, + 
X86SchedWriteWidths sched> { + defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.ZMM, v16f32_info>, + avx512_fp_round_packed<opc, OpcodeStr, OpNode, sched.ZMM, v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.ZMM, v8f64_info>, + avx512_fp_round_packed<opc, OpcodeStr, OpNode, sched.ZMM, v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, sched.Scl, f32x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, sched.Scl>, + EVEX_4V,EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, sched.Scl, f64x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, sched.Scl>, + EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; + + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.XMM, v4f32x_info>, + EVEX_V128, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.YMM, v8f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.XMM, v2f64x_info>, + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, sched.YMM, v4f64x_info>, + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + } +} +defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs, + SchedWriteFAdd>, T8PD, NotEVEX2VEXConvertible; + +//===----------------------------------------------------------------------===// +// AVX-512 VPTESTM instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _, + string Name> { + let ExeDomain = _.ExeDomain in { + let isCommutable = 1 in + defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))), + _.ImmAllZerosV)>, + EVEX_4V, Sched<[sched]>; + defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (bitconvert + (_.i64VT (and _.RC:$src1, + (bitconvert (_.LdFrag addr:$src2))))), + _.ImmAllZerosV)>, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; + } + + // Patterns for compare with 0 that just use the same source twice. + def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)), + (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rr") + _.RC:$src, _.RC:$src))>; + + def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))), + (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rrk") + _.KRC:$mask, _.RC:$src, _.RC:$src))>; +} + +multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, PatFrag OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in + defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (OpNode (and _.RC:$src1, + (X86VBroadcast + (_.ScalarLdFrag addr:$src2))), + _.ImmAllZerosV)>, + EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +// Use 512bit version to implement 128/256 bit in case NoVLX. 
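+// Sketch: without VLX the 128/256-bit vptestm/vptestnm encodings are not
+// available, so the xmm/ymm operands are widened into the containing zmm
+// registers with INSERT_SUBREG, the 512-bit "Zrr"/"Zrrk" instruction is used,
+// and the resulting mask is moved to the narrower k-register class with
+// COPY_TO_REGCLASS.  Roughly,
+//   vptestmb %zmm1, %zmm0, %k0      ; register names illustrative only
+// is emitted for a compare that logically only involves ymm or xmm values.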
+multiclass avx512_vptest_lowering<PatFrag OpNode, X86VectorVTInfo ExtendInfo, + X86VectorVTInfo _, string Name> { + def : Pat<(_.KVT (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))), + _.ImmAllZerosV)), + (_.KVT (COPY_TO_REGCLASS + (!cast<Instruction>(Name # "Zrr") + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src1, _.SubRegIdx), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src2, _.SubRegIdx)), + _.KRC))>; + + def : Pat<(_.KVT (and _.KRC:$mask, + (OpNode (bitconvert (_.i64VT (and _.RC:$src1, _.RC:$src2))), + _.ImmAllZerosV))), + (COPY_TO_REGCLASS + (!cast<Instruction>(Name # "Zrrk") + (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src1, _.SubRegIdx), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src2, _.SubRegIdx)), + _.KRC)>; + + def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)), + (_.KVT (COPY_TO_REGCLASS + (!cast<Instruction>(Name # "Zrr") + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src, _.SubRegIdx), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src, _.SubRegIdx)), + _.KRC))>; + + def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))), + (COPY_TO_REGCLASS + (!cast<Instruction>(Name # "Zrrk") + (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src, _.SubRegIdx), + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src, _.SubRegIdx)), + _.KRC)>; +} + +multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, PatFrag OpNode, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in + defm Z : avx512_vptest<opc, OpcodeStr, OpNode, sched.ZMM, _.info512, NAME>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, sched.YMM, _.info256, NAME>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, EVEX_V256; + defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, sched.XMM, _.info128, NAME>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, EVEX_V128; + } + let Predicates = [HasAVX512, NoVLX] in { + defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, NAME>; + defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, NAME>; + } +} + +multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, PatFrag OpNode, + X86SchedWriteWidths sched> { + defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, sched, + avx512vl_i32_info>; + defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, sched, + avx512vl_i64_info>, VEX_W; +} + +multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr, + PatFrag OpNode, X86SchedWriteWidths sched> { + let Predicates = [HasBWI] in { + defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.ZMM, + v32i16_info, NAME#"W">, EVEX_V512, VEX_W; + defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.ZMM, + v64i8_info, NAME#"B">, EVEX_V512; + } + let Predicates = [HasVLX, HasBWI] in { + + defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.YMM, + v16i16x_info, NAME#"W">, EVEX_V256, VEX_W; + defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, sched.XMM, + v8i16x_info, NAME#"W">, EVEX_V128, VEX_W; + defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.YMM, + v32i8x_info, NAME#"B">, EVEX_V256; + defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, sched.XMM, + v16i8x_info, NAME#"B">, EVEX_V128; + } + + let Predicates = [HasAVX512, NoVLX] 
in { + defm BZ256_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v32i8x_info, NAME#"B">; + defm BZ128_Alt : avx512_vptest_lowering<OpNode, v64i8_info, v16i8x_info, NAME#"B">; + defm WZ256_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v16i16x_info, NAME#"W">; + defm WZ128_Alt : avx512_vptest_lowering<OpNode, v32i16_info, v8i16x_info, NAME#"W">; + } +} + +// These patterns are used to match vptestm/vptestnm. We don't treat pcmpeqm +// as commutable here because we already canonicalized all zeros vectors to the +// RHS during lowering. +def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2), + (setcc node:$src1, node:$src2, SETEQ)>; +def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2), + (setcc node:$src1, node:$src2, SETNE)>; + +multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr, + PatFrag OpNode, X86SchedWriteWidths sched> : + avx512_vptest_wb<opc_wb, OpcodeStr, OpNode, sched>, + avx512_vptest_dq<opc_dq, OpcodeStr, OpNode, sched>; + +defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem, + SchedWriteVecLogic>, T8PD; +defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm, + SchedWriteVecLogic>, T8XS; + +//===----------------------------------------------------------------------===// +// AVX-512 Shift instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst), + (ins _.RC:$src1, u8imm:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (i8 imm:$src2)))>, + Sched<[sched]>; + defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), + (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i8 imm:$src2)))>, + Sched<[sched.Folded]>; + } +} + +multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in + defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, + "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", + (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2)))>, + EVEX_B, Sched<[sched.Folded]>; +} + +multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, ValueType SrcVT, + PatFrag bc_frag, X86VectorVTInfo _> { + // src2 is always 128-bit + let ExeDomain = _.ExeDomain in { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, VR128X:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2)))>, + AVX512BIBase, EVEX_4V, Sched<[sched]>; + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, i128mem:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2))))>, + AVX512BIBase, + EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, ValueType SrcVT, + PatFrag bc_frag, AVX512VLVectorVTInfo VTInfo, + Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_shift_rrm<opc, 
OpcodeStr, OpNode, sched.ZMM, SrcVT, + bc_frag, VTInfo.info512>, EVEX_V512, + EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.YMM, SrcVT, + bc_frag, VTInfo.info256>, EVEX_V256, + EVEX_CD8<VTInfo.info256.EltSize, CD8VH>; + defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, sched.XMM, SrcVT, + bc_frag, VTInfo.info128>, EVEX_V128, + EVEX_CD8<VTInfo.info128.EltSize, CD8VF>; + } +} + +multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw, + string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, + bit NotEVEX2VEXConvertibleQ = 0> { + defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, sched, v4i32, + bc_v4i32, avx512vl_i32_info, HasAVX512>; + let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in + defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, sched, v2i64, + bc_v2i64, avx512vl_i64_info, HasAVX512>, VEX_W; + defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, sched, v8i16, + bc_v2i64, avx512vl_i16_info, HasBWI>; +} + +multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, + AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasAVX512] in + defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + sched.ZMM, VTInfo.info512>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, sched.ZMM, + VTInfo.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in { + defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + sched.YMM, VTInfo.info256>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, sched.YMM, + VTInfo.info256>, EVEX_V256; + defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + sched.XMM, VTInfo.info128>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, sched.XMM, + VTInfo.info128>, EVEX_V128; + } +} + +multiclass avx512_shift_rmi_w<bits<8> opcw, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasBWI] in + defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + sched.ZMM, v32i16_info>, EVEX_V512, VEX_WIG; + let Predicates = [HasVLX, HasBWI] in { + defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + sched.YMM, v16i16x_info>, EVEX_V256, VEX_WIG; + defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, + sched.XMM, v8i16x_info>, EVEX_V128, VEX_WIG; + } +} + +multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq, + Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, + bit NotEVEX2VEXConvertibleQ = 0> { + defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode, + sched, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; + let notEVEX2VEXConvertible = NotEVEX2VEXConvertibleQ in + defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode, + sched, avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; +} + +defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli, + SchedWriteVecShiftImm>, + avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli, + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + +defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli, + SchedWriteVecShiftImm>, + avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli, + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + +defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, 
"vpsra", X86vsrai, + SchedWriteVecShiftImm, 1>, + avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai, + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + +defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri, + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; +defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli, + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + +defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl, + SchedWriteVecShift>; +defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra, + SchedWriteVecShift, 1>; +defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl, + SchedWriteVecShift>; + +// Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX. +let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPSRAQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + VR128X:$src2)), sub_ymm)>; + + def : Pat<(v2i64 (X86vsra (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPSRAQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + VR128X:$src2)), sub_xmm)>; + + def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPSRAQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPSRAQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; +} + +//===-------------------------------------------------------------------===// +// Variable Bit Shifts +//===-------------------------------------------------------------------===// + +multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2)))>, + AVX5128IBase, EVEX_4V, Sched<[sched]>; + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (_.VT (bitconvert (_.LdFrag addr:$src2)))))>, + AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src2)))))>, + AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in + defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, + 
avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, EVEX_V256; + defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, EVEX_V128; + } +} + +multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86SchedWriteWidths sched> { + defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, sched, + avx512vl_i32_info>; + defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, sched, + avx512vl_i64_info>, VEX_W; +} + +// Use 512bit version to implement 128/256 bit in case NoVLX. +multiclass avx512_var_shift_lowering<AVX512VLVectorVTInfo _, string OpcodeStr, + SDNode OpNode, list<Predicate> p> { + let Predicates = p in { + def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), + (_.info256.VT _.info256.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(OpcodeStr#"Zrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + sub_ymm)>; + + def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), + (_.info128.VT _.info128.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(OpcodeStr#"Zrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), + sub_xmm)>; + } +} +multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86SchedWriteWidths sched> { + let Predicates = [HasBWI] in + defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, sched.ZMM, v32i16_info>, + EVEX_V512, VEX_W; + let Predicates = [HasVLX, HasBWI] in { + + defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, sched.YMM, v16i16x_info>, + EVEX_V256, VEX_W; + defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, sched.XMM, v8i16x_info>, + EVEX_V128, VEX_W; + } +} + +defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl, SchedWriteVarVecShift>, + avx512_var_shift_w<0x12, "vpsllvw", shl, SchedWriteVarVecShift>; + +defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra, SchedWriteVarVecShift>, + avx512_var_shift_w<0x11, "vpsravw", sra, SchedWriteVarVecShift>; + +defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl, SchedWriteVarVecShift>, + avx512_var_shift_w<0x10, "vpsrlvw", srl, SchedWriteVarVecShift>; + +defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>; +defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>; + +defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", sra, [HasAVX512, NoVLX]>; +defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", shl, [HasBWI, NoVLX]>; +defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", sra, [HasBWI, NoVLX]>; +defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", srl, [HasBWI, NoVLX]>; + +// Special handing for handling VPSRAV intrinsics. 
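+// X86vsrav is the node used for the vpsrav* intrinsics; rather than defining
+// separate intrinsic-only instructions, the patterns below map its plain,
+// vselect-masked and broadcast-load forms onto the VPSRAV(W/D/Q) instructions
+// defined above, e.g. the zero-masked register form selects roughly as
+//   VPSRAVDZrrkz $mask, $src1, $src2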
+multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _, + list<Predicate> p> { + let Predicates = p in { + def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)), + (!cast<Instruction>(InstrStr#_.ZSuffix#rr) _.RC:$src1, + _.RC:$src2)>; + def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))), + (!cast<Instruction>(InstrStr#_.ZSuffix##rm) + _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)), + (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0, + _.KRC:$mask, _.RC:$src1, _.RC:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))), + _.RC:$src0)), + (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0, + _.KRC:$mask, _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask, + _.RC:$src1, _.RC:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2))), + _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask, + _.RC:$src1, addr:$src2)>; + } +} + +multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _, + list<Predicate> p> : + avx512_var_shift_int_lowering<InstrStr, _, p> { + let Predicates = p in { + def : Pat<(_.VT (X86vsrav _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src2)))), + (!cast<Instruction>(InstrStr#_.ZSuffix##rmb) + _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86vsrav _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src2))), + _.RC:$src0)), + (!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0, + _.KRC:$mask, _.RC:$src1, addr:$src2)>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (X86vsrav _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src2))), + _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#_.ZSuffix##rmbkz) _.KRC:$mask, + _.RC:$src1, addr:$src2)>; + } +} + +defm : avx512_var_shift_int_lowering<"VPSRAVW", v8i16x_info, [HasVLX, HasBWI]>; +defm : avx512_var_shift_int_lowering<"VPSRAVW", v16i16x_info, [HasVLX, HasBWI]>; +defm : avx512_var_shift_int_lowering<"VPSRAVW", v32i16_info, [HasBWI]>; +defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v4i32x_info, [HasVLX]>; +defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v8i32x_info, [HasVLX]>; +defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v16i32_info, [HasAVX512]>; +defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>; +defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>; +defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>; + +// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. 
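+// For example, with AVX512F but no VLX, (v4i32 (rotl VR128X:$src1,
+// VR128X:$src2)) is selected as VPROLVDZrr on the containing zmm registers
+// followed by an EXTRACT_SUBREG of sub_xmm, and the immediate X86vrotli form
+// maps to VPROLDZri/VPROLQZri in the same way.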
+let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))), + sub_xmm)>; + def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), + sub_ymm)>; + + def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))), + sub_xmm)>; + def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), + sub_ymm)>; + + def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; +} + +// Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. 
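// (This block mirrors the rotate-left lowering above: rotr and X86vrotri on 128/256-bit
// vectors are widened to the 512-bit VPRORVD/VPRORVQ and VPRORD/VPRORQ forms, then the
// low xmm/ymm subregister is extracted back out.)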
+let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))), + sub_xmm)>; + def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), + sub_ymm)>; + + def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))), + sub_xmm)>; + def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), + sub_ymm)>; + + def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; +} + +//===-------------------------------------------------------------------===// +// 1-src variable permutation VPERMW/D/Q +//===-------------------------------------------------------------------===// + +multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in + defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info512>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched, _.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in + defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info256>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, sched, _.info256>, EVEX_V256; +} + +multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, + string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, AVX512VLVectorVTInfo VTInfo> { + let Predicates = [HasAVX512] in + defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + sched, VTInfo.info512>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + sched, VTInfo.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in + defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + sched, VTInfo.info256>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + sched, VTInfo.info256>, EVEX_V256; +} + +multiclass avx512_vperm_bw<bits<8> opc, string OpcodeStr, + Predicate prd, SDNode OpNode, + X86FoldableSchedWrite sched, AVX512VLVectorVTInfo _> { + let Predicates = [prd] in + defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info512>, + EVEX_V512 ; + let Predicates = [HasVLX, prd] in { + defm Z256: 
avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info256>, + EVEX_V256 ; + defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, sched, _.info128>, + EVEX_V128 ; + } +} + +defm VPERMW : avx512_vperm_bw<0x8D, "vpermw", HasBWI, X86VPermv, + WriteVarShuffle256, avx512vl_i16_info>, VEX_W; +defm VPERMB : avx512_vperm_bw<0x8D, "vpermb", HasVBMI, X86VPermv, + WriteVarShuffle256, avx512vl_i8_info>; + +defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv, + WriteVarShuffle256, avx512vl_i32_info>; +defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv, + WriteVarShuffle256, avx512vl_i64_info>, VEX_W; +defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv, + WriteFVarShuffle256, avx512vl_f32_info>; +defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv, + WriteFVarShuffle256, avx512vl_f64_info>, VEX_W; + +defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq", + X86VPermi, WriteShuffle256, avx512vl_i64_info>, + EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; +defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd", + X86VPermi, WriteFShuffle256, avx512vl_f64_info>, + EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; + +//===----------------------------------------------------------------------===// +// AVX-512 - VPERMIL +//===----------------------------------------------------------------------===// + +multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _, + X86VectorVTInfo Ctrl> { + defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (Ctrl.VT Ctrl.RC:$src2)))>, + T8PD, EVEX_4V, Sched<[sched]>; + defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; + defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (X86VBroadcast + (Ctrl.ScalarLdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar, + X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _, + AVX512VLVectorVTInfo Ctrl> { + let Predicates = [HasAVX512] in { + defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, sched.ZMM, + _.info512, Ctrl.info512>, EVEX_V512; + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, sched.XMM, + _.info128, Ctrl.info128>, EVEX_V128; + defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, sched.YMM, + _.info256, Ctrl.info256>, EVEX_V256; + } +} + +multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, SchedWriteFVarShuffle, + _, Ctrl>; + defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr, + X86VPermilpi, SchedWriteFShuffle, _>, + EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>; +} + +let ExeDomain = SSEPackedSingle in +defm VPERMILPS : 
avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info, + avx512vl_i32_info>; +let ExeDomain = SSEPackedDouble in +defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, + avx512vl_i64_info>, VEX_W1X; + +//===----------------------------------------------------------------------===// +// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW +//===----------------------------------------------------------------------===// + +defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd", + X86PShufd, SchedWriteShuffle, avx512vl_i32_info>, + EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>; +defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw", + X86PShufhw, SchedWriteShuffle>, + EVEX, AVX512XSIi8Base; +defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw", + X86PShuflw, SchedWriteShuffle>, + EVEX, AVX512XDIi8Base; + +//===----------------------------------------------------------------------===// +// AVX-512 - VPSHUFB +//===----------------------------------------------------------------------===// + +multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasBWI] in + defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, sched.ZMM, v64i8_info>, + EVEX_V512; + + let Predicates = [HasVLX, HasBWI] in { + defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, sched.YMM, v32i8x_info>, + EVEX_V256; + defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, sched.XMM, v16i8x_info>, + EVEX_V128; + } +} + +defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb, + SchedWriteVarShuffle>, VEX_WIG; + +//===----------------------------------------------------------------------===// +// Move Low to High and High to Low packed FP Instructions +//===----------------------------------------------------------------------===// + +def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + "vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))]>, + Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V; +let isCommutable = 1 in +def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + "vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))]>, + Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V, NotMemoryFoldable; + +//===----------------------------------------------------------------------===// +// VMOVHPS/PD VMOVLPS Instructions +// All patterns was taken from SSS implementation. +//===----------------------------------------------------------------------===// + +multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode, + X86VectorVTInfo _> { + let hasSideEffects = 0, mayLoad = 1, ExeDomain = _.ExeDomain in + def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, f64mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (OpNode _.RC:$src1, + (_.VT (bitconvert + (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))]>, + Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>, EVEX_4V; +} + +// No patterns for MOVLPS/MOVHPS as the Movlhps node should only be created in +// SSE1. And MOVLPS pattern is even more complex. 
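For context, the half-vector moves being re-encoded here are the classic SSE/SSE2 operations shown below; when the EVEX encoding is required (for example, with xmm16-xmm31 operands), they assemble to the VMOVLHPS/VMOVHLPS and VMOVHPS/VMOVHPD/VMOVLPS/VMOVLPD forms defined in this section. A hedged C sketch (assumes <immintrin.h>; the helper name is illustrative only):

#include <immintrin.h>

__m128d hilo_demo(__m128 s, __m128d d, const double *p, double *q) {
    __m128  lh = _mm_movelh_ps(s, s);   // (v)movlhps: combine the low halves of both sources
    __m128  hl = _mm_movehl_ps(s, s);   // (v)movhlps: combine the high halves of both sources
    __m128d h  = _mm_loadh_pd(d, p);    // (v)movhpd load form: replace the high element
    _mm_storel_pd(q, h);                // (v)movlpd store form: write out the low element
    return _mm_add_pd(h, _mm_cvtps_pd(_mm_add_ps(lh, hl)));
}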
+defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", null_frag, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Unpckl, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; +defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", null_frag, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movsd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; + +let Predicates = [HasAVX512] in { + // VMOVHPD patterns + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; +} + +let SchedRW = [WriteFStore] in { +def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)), + (bc_v2f64 (v4f32 VR128X:$src))), + (iPTR 0))), addr:$dst)]>, + EVEX, EVEX_CD8<32, CD8VT2>; +def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)), + (iPTR 0))), addr:$dst)]>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128X:$src)), + (iPTR 0))), addr:$dst)]>, + EVEX, EVEX_CD8<32, CD8VT2>; +def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (v2f64 VR128X:$src), + (iPTR 0))), addr:$dst)]>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +} // SchedRW + +let Predicates = [HasAVX512] in { + // VMOVHPD patterns + def : Pat<(store (f64 (extractelt + (v2f64 (X86VPermilpi VR128X:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (VMOVHPDZ128mr addr:$dst, VR128X:$src)>; +} +//===----------------------------------------------------------------------===// +// FMA - Fused Multiply Operations +// + +multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, + X86VectorVTInfo _, string Suff> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>, + AVX512FMA3Base, Sched<[sched]>; + + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>, + AVX512FMA3Base, Sched<[sched.Folded, ReadAfterLd]>; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (OpNode _.RC:$src2, + _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>, + AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, + X86VectorVTInfo _, string Suff> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + 
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", + (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>, + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; +} + +multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _, string Suff> { + let Predicates = [HasAVX512] in { + defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.ZMM, + _.info512, Suff>, + avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM, + _.info512, Suff>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + } + let Predicates = [HasVLX, HasAVX512] in { + defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.YMM, + _.info256, Suff>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.XMM, + _.info128, Suff>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; + } +} + +multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd> { + defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, + SchedWriteFMA, avx512vl_f32_info, "PS">; + defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, + SchedWriteFMA, avx512vl_f64_info, "PD">, + VEX_W; +} + +defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>; +defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>; + + +multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, + X86VectorVTInfo _, string Suff> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1, + vselect, 1>, AVX512FMA3Base, Sched<[sched]>; + + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>, + AVX512FMA3Base, Sched<[sched.Folded, ReadAfterLd]>; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr, + (_.VT (OpNode _.RC:$src2, + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, + X86VectorVTInfo _, string Suff> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", + (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), + 1, 1, vselect, 1>, + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; +} + +multiclass 
avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _, string Suff> { + let Predicates = [HasAVX512] in { + defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.ZMM, + _.info512, Suff>, + avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM, + _.info512, Suff>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + } + let Predicates = [HasVLX, HasAVX512] in { + defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.YMM, + _.info256, Suff>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.XMM, + _.info128, Suff>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; + } +} + +multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd > { + defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, + SchedWriteFMA, avx512vl_f32_info, "PS">; + defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, + SchedWriteFMA, avx512vl_f64_info, "PD">, + VEX_W; +} + +defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>; +defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>; + +multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, + X86VectorVTInfo _, string Suff> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1, vselect, 1>, + AVX512FMA3Base, Sched<[sched]>; + + // Pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>, + AVX512FMA3Base, Sched<[sched.Folded, ReadAfterLd]>; + + // Pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
+ defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr, + (_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + _.RC:$src1, _.RC:$src2)), 1, 0>, + AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, + X86VectorVTInfo _, string Suff> { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in + defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", + (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), + 1, 1, vselect, 1>, + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>; +} + +multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _, string Suff> { + let Predicates = [HasAVX512] in { + defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.ZMM, + _.info512, Suff>, + avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM, + _.info512, Suff>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + } + let Predicates = [HasVLX, HasAVX512] in { + defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.YMM, + _.info256, Suff>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.XMM, + _.info128, Suff>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; + } +} + +multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd > { + defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd, + SchedWriteFMA, avx512vl_f32_info, "PS">; + defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd, + SchedWriteFMA, avx512vl_f64_info, "PD">, + VEX_W; +} + +defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>; +defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>; +defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>; +defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>; +defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>; + +// Scalar FMA +multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + dag RHS_r, dag RHS_m, dag RHS_b, bit MaskOnlyReg> { +let Constraints = "$src1 = $dst", hasSideEffects = 0 in { + defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), OpcodeStr, + "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>, + AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>; + + let mayLoad = 1 in + defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr, + "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>, + AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, ReadAfterLd]>; + + defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1>, + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>; + + let isCodeGenOnly = 1, isCommutable 
= 1 in { + def r : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>; + def m : AVX512FMA3S<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, ReadAfterLd]>; + + def rb : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3, AVX512RC:$rc), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !if(MaskOnlyReg, [], [RHS_b])>, EVEX_B, EVEX_RC, + Sched<[SchedWriteFMA.Scl]>; + }// isCodeGenOnly = 1 +}// Constraints = "$src1 = $dst" +} + +multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132, + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, + X86VectorVTInfo _, string SUFF> { + let ExeDomain = _.ExeDomain in { + defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix, _, + // Operands for intrinsic are in 123 order to preserve passthu + // semantics. + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, + _.FRC:$src3))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, + (_.ScalarLdFrag addr:$src3)))), + (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src1, + _.FRC:$src3, (i32 imm:$rc)))), 0>; + + defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _, + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3, + _.FRC:$src1))), + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, + (_.ScalarLdFrag addr:$src3), _.FRC:$src1))), + (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src2, _.FRC:$src3, + _.FRC:$src1, (i32 imm:$rc)))), 1>; + + // One pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
+ defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix, _, + (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3, + _.FRC:$src2))), + (set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3), + _.FRC:$src1, _.FRC:$src2))), + (set _.FRC:$dst, (_.EltVT (OpNodeRnd _.FRC:$src1, _.FRC:$src3, + _.FRC:$src2, (i32 imm:$rc)))), 1>; + } +} + +multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132, + string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd> { + let Predicates = [HasAVX512] in { + defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, + OpNodeRnd, f32x_info, "SS">, + EVEX_CD8<32, CD8VT1>, VEX_LIG; + defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, + OpNodeRnd, f64x_info, "SD">, + EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; + } +} + +defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>; +defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>; +defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>; +defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>; + +multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix, + string Suffix, SDNode Move, + X86VectorVTInfo _, PatLeaf ZeroFP> { + let Predicates = [HasAVX512] in { + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (Op _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src3))))), + (!cast<I>(Prefix#"213"#Suffix#"Zr_Int") + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (Op _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast<I>(Prefix#"231"#Suffix#"Zr_Int") + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (Op _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src3)))))), + (!cast<I>(Prefix#"213"#Suffix#"Zm_Int") + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src3), _.FRC:$src2))))), + (!cast<I>(Prefix#"132"#Suffix#"Zm_Int") + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast<I>(Prefix#"231"#Suffix#"Zm_Int") + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src3), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast<I>(Prefix#"213"#Suffix#"Zr_Intk") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src3)), + (_.EltVT (extractelt (_.VT VR128X:$src1), 
(iPTR 0)))))))), + (!cast<I>(Prefix#"213"#Suffix#"Zm_Intk") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src3), _.FRC:$src2), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast<I>(Prefix#"132"#Suffix#"Zm_Intk") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast<I>(Prefix#"231"#Suffix#"Zr_Intk") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast<I>(Prefix#"231"#Suffix#"Zm_Intk") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src3), + (_.EltVT ZeroFP)))))), + (!cast<I>(Prefix#"213"#Suffix#"Zr_Intkz") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), + (_.EltVT ZeroFP)))))), + (!cast<I>(Prefix#"231"#Suffix#"Zr_Intkz") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (_.ScalarLdFrag addr:$src3)), + (_.EltVT ZeroFP)))))), + (!cast<I>(Prefix#"213"#Suffix#"Zm_Intkz") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src2, (_.ScalarLdFrag addr:$src3)), + (_.EltVT ZeroFP)))))), + (!cast<I>(Prefix#"132"#Suffix#"Zm_Intkz") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))), + (_.EltVT ZeroFP)))))), + (!cast<I>(Prefix#"231"#Suffix#"Zm_Intkz") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>; + + // Patterns with rounding mode. 
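// These remaining patterns fold an explicit rounding-control operand (the *Rnd nodes,
// i32 imm:$rc) into the Zrb_Int forms; as above, X86selects supplies the merge-masked
// (Zrb_Intk) and zero-masked (Zrb_Intkz) variants.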
+ def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (RndOp _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src3, (i32 imm:$rc)))))), + (!cast<I>(Prefix#"213"#Suffix#"Zrb_Int") + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (RndOp _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (i32 imm:$rc)))))), + (!cast<I>(Prefix#"231"#Suffix#"Zrb_Int") + VR128X:$src1, (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (RndOp _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src3, (i32 imm:$rc)), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast<I>(Prefix#"213"#Suffix#"Zrb_Intk") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (RndOp _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (i32 imm:$rc)), + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))), + (!cast<I>(Prefix#"231"#Suffix#"Zrb_Intk") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (RndOp _.FRC:$src2, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src3, (i32 imm:$rc)), + (_.EltVT ZeroFP)))))), + (!cast<I>(Prefix#"213"#Suffix#"Zrb_Intkz") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; + + def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector + (X86selects VK1WM:$mask, + (RndOp _.FRC:$src2, _.FRC:$src3, + (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + (i32 imm:$rc)), + (_.EltVT ZeroFP)))))), + (!cast<I>(Prefix#"231"#Suffix#"Zrb_Intkz") + VR128X:$src1, VK1WM:$mask, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), + (_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), imm:$rc)>; + } +} + +defm : avx512_scalar_fma_patterns<X86Fmadd, X86FmaddRnd, "VFMADD", "SS", + X86Movss, v4f32x_info, fp32imm0>; +defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SS", + X86Movss, v4f32x_info, fp32imm0>; +defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SS", + X86Movss, v4f32x_info, fp32imm0>; +defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SS", + X86Movss, v4f32x_info, fp32imm0>; + +defm : avx512_scalar_fma_patterns<X86Fmadd, X86FmaddRnd, "VFMADD", "SD", + X86Movsd, v2f64x_info, fp64imm0>; +defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SD", + X86Movsd, v2f64x_info, fp64imm0>; +defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SD", + X86Movsd, v2f64x_info, fp64imm0>; +defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SD", + X86Movsd, v2f64x_info, fp64imm0>; + +//===----------------------------------------------------------------------===// +// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA 
+//===----------------------------------------------------------------------===// +let Constraints = "$src1 = $dst" in { +multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + // NOTE: The SDNode have the multiply operands first with the add last. + // This enables commuted load patterns to be autogenerated by tablegen. + let ExeDomain = _.ExeDomain in { + defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>, + AVX512FMA3Base, Sched<[sched]>; + + defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, + AVX512FMA3Base, Sched<[sched.Folded, ReadAfterLd]>; + + defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (OpNode _.RC:$src2, + (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))), + _.RC:$src1)>, + AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + } +} +} // Constraints = "$src1 = $dst" + +multiclass avx512_pmadd52_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> { + let Predicates = [HasIFMA] in { + defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, + EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; + } + let Predicates = [HasVLX, HasIFMA] in { + defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, + EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; + defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, + EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; + } +} + +defm VPMADD52LUQ : avx512_pmadd52_common<0xb4, "vpmadd52luq", x86vpmadd52l, + SchedWriteVecIMul, avx512vl_i64_info>, + VEX_W; +defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h, + SchedWriteVecIMul, avx512vl_i64_info>, + VEX_W; + +//===----------------------------------------------------------------------===// +// AVX-512 Scalar convert from sign integer to float/double +//===----------------------------------------------------------------------===// + +multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, X86FoldableSchedWrite sched, + RegisterClass SrcRC, X86VectorVTInfo DstVT, + X86MemOperand x86memop, PatFrag ld_frag, string asm> { + let hasSideEffects = 0 in { + def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst), + (ins DstVT.FRC:$src1, SrcRC:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + EVEX_4V, Sched<[sched]>; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst), + (ins DstVT.FRC:$src1, x86memop:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>; + } // hasSideEffects = 0 + let isCodeGenOnly = 1 in { + def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, SrcRC:$src2), + !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), + SrcRC:$src2, + (i32 FROUND_CURRENT)))]>, + EVEX_4V, Sched<[sched]>; + + def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, x86memop:$src2), + !strconcat(asm,"\t{$src2, $src1, 
$dst|$dst, $src1, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), + (ld_frag addr:$src2), + (i32 FROUND_CURRENT)))]>, + EVEX_4V, Sched<[sched.Folded, ReadAfterLd]>; + }//isCodeGenOnly = 1 +} + +multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, + X86FoldableSchedWrite sched, RegisterClass SrcRC, + X86VectorVTInfo DstVT, string asm> { + def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), + (ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), + !strconcat(asm, + "\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}"), + [(set DstVT.RC:$dst, + (OpNode (DstVT.VT DstVT.RC:$src1), + SrcRC:$src2, + (i32 imm:$rc)))]>, + EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>; +} + +multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, + X86FoldableSchedWrite sched, + RegisterClass SrcRC, X86VectorVTInfo DstVT, + X86MemOperand x86memop, PatFrag ld_frag, string asm> { + defm NAME : avx512_vcvtsi_round<opc, OpNode, sched, SrcRC, DstVT, asm>, + avx512_vcvtsi<opc, OpNode, sched, SrcRC, DstVT, x86memop, + ld_frag, asm>, VEX_LIG; +} + +let Predicates = [HasAVX512] in { +defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR32, + v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SS, GR64, + v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">, + XS, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR32, + v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">, + XD, EVEX_CD8<32, CD8VT1>; +defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, WriteCvtI2SD, GR64, + v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; + +def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SSZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">; +def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">; + +def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; + +def : Pat<(f32 (sint_to_fp GR32:$src)), + (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f32 (sint_to_fp GR64:$src)), + (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>; +def : Pat<(f64 (sint_to_fp GR32:$src)), + (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f64 (sint_to_fp GR64:$src)), + (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; + +defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR32, + v4f32x_info, i32mem, loadi32, + "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SS, GR64, + v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">, + XS, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR32, v2f64x_info, + i32mem, loadi32, "cvtusi2sd{l}">, + XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; +defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, WriteCvtI2SD, GR64, + v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; + +def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTUSI2SSZrm 
FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">; +def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTUSI2SDZrm FR64X:$dst, FR64X:$src1, i32mem:$src), 0, "att">; + +def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))), + (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))), + (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))), + (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; +def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))), + (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>; + +def : Pat<(f32 (uint_to_fp GR32:$src)), + (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f32 (uint_to_fp GR64:$src)), + (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>; +def : Pat<(f64 (uint_to_fp GR32:$src)), + (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>; +def : Pat<(f64 (uint_to_fp GR64:$src)), + (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 Scalar convert from float/double to integer +//===----------------------------------------------------------------------===// + +multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT, + X86VectorVTInfo DstVT, SDNode OpNode, + X86FoldableSchedWrite sched, string asm, + string aliasStr, + bit CodeGenOnly = 1> { + let Predicates = [HasAVX512] in { + def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))]>, + EVEX, VEX_LIG, Sched<[sched]>; + def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc), + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), + [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>, + EVEX, VEX_LIG, EVEX_B, EVEX_RC, + Sched<[sched]>; + let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in + def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstVT.RC:$dst, (OpNode + (SrcVT.VT SrcVT.ScalarIntMemCPat:$src), + (i32 FROUND_CURRENT)))]>, + EVEX, VEX_LIG, Sched<[sched.Folded, ReadAfterLd]>; + + def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "rr_Int") DstVT.RC:$dst, SrcVT.RC:$src), 0, "att">; + def : InstAlias<"v" # asm # aliasStr # "\t{$rc, $src, $dst|$dst, $src, $rc}", + (!cast<Instruction>(NAME # "rrb_Int") DstVT.RC:$dst, SrcVT.RC:$src, AVX512RC:$rc), 0, "att">; + } // Predicates = [HasAVX512] +} + +multiclass avx512_cvt_s_int_round_aliases<bits<8> opc, X86VectorVTInfo SrcVT, + X86VectorVTInfo DstVT, SDNode OpNode, + X86FoldableSchedWrite sched, string asm, + string aliasStr> : + avx512_cvt_s_int_round<opc, SrcVT, DstVT, OpNode, sched, asm, aliasStr, 0> { + let Predicates = [HasAVX512] in { + def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "rm_Int") DstVT.RC:$dst, + SrcVT.IntScalarMemOp:$src), 0, "att">; + } // Predicates = [HasAVX512] +} + +// Convert float/double to signed/unsigned int 32/64 +defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info, + X86cvts2si, WriteCvtSS2I, "cvtss2si", "{l}">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info, + X86cvts2si, WriteCvtSS2I, "cvtss2si", "{q}">, + XS, VEX_W, EVEX_CD8<32, CD8VT1>; +defm 
VCVTSS2USIZ: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i32x_info, + X86cvts2usi, WriteCvtSS2I, "cvtss2usi", "{l}">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTSS2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f32x_info, i64x_info, + X86cvts2usi, WriteCvtSS2I, "cvtss2usi", "{q}">, + XS, VEX_W, EVEX_CD8<32, CD8VT1>; +defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, + X86cvts2si, WriteCvtSD2I, "cvtsd2si", "{l}">, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info, + X86cvts2si, WriteCvtSD2I, "cvtsd2si", "{q}">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2USIZ: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i32x_info, + X86cvts2usi, WriteCvtSD2I, "cvtsd2usi", "{l}">, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTSD2USI64Z: avx512_cvt_s_int_round_aliases<0x79, f64x_info, i64x_info, + X86cvts2usi, WriteCvtSD2I, "cvtsd2usi", "{q}">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; + +// The SSE version of these instructions are disabled for AVX512. +// Therefore, the SSE intrinsics are mapped to the AVX512 instructions. +let Predicates = [HasAVX512] in { + def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))), + (VCVTSS2SIZrr_Int VR128X:$src)>; + def : Pat<(i32 (int_x86_sse_cvtss2si sse_load_f32:$src)), + (VCVTSS2SIZrm_Int sse_load_f32:$src)>; + def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))), + (VCVTSS2SI64Zrr_Int VR128X:$src)>; + def : Pat<(i64 (int_x86_sse_cvtss2si64 sse_load_f32:$src)), + (VCVTSS2SI64Zrm_Int sse_load_f32:$src)>; + def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))), + (VCVTSD2SIZrr_Int VR128X:$src)>; + def : Pat<(i32 (int_x86_sse2_cvtsd2si sse_load_f64:$src)), + (VCVTSD2SIZrm_Int sse_load_f64:$src)>; + def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))), + (VCVTSD2SI64Zrr_Int VR128X:$src)>; + def : Pat<(i64 (int_x86_sse2_cvtsd2si64 sse_load_f64:$src)), + (VCVTSD2SI64Zrm_Int sse_load_f64:$src)>; +} // HasAVX512 + +// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang +// which produce unnecessary vmovs{s,d} instructions +let Predicates = [HasAVX512] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), + (VCVTSI642SSZrm_Int VR128X:$dst, addr:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), + (VCVTSI2SSZrm_Int VR128X:$dst, addr:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), + (VCVTSI642SDZrm_Int VR128X:$dst, addr:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), + (VCVTSI2SDZrm_Int VR128X:$dst, addr:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (uint_to_fp GR64:$src)))))), + 
(VCVTUSI642SSZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (uint_to_fp (loadi64 addr:$src))))))), + (VCVTUSI642SSZrm_Int VR128X:$dst, addr:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (uint_to_fp GR32:$src)))))), + (VCVTUSI2SSZrr_Int VR128X:$dst, GR32:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector (f32 (uint_to_fp (loadi32 addr:$src))))))), + (VCVTUSI2SSZrm_Int VR128X:$dst, addr:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (uint_to_fp GR64:$src)))))), + (VCVTUSI642SDZrr_Int VR128X:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (uint_to_fp (loadi64 addr:$src))))))), + (VCVTUSI642SDZrm_Int VR128X:$dst, addr:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (uint_to_fp GR32:$src)))))), + (VCVTUSI2SDZrr_Int VR128X:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector (f64 (uint_to_fp (loadi32 addr:$src))))))), + (VCVTUSI2SDZrm_Int VR128X:$dst, addr:$src)>; +} // Predicates = [HasAVX512] + +// Convert float/double to signed/unsigned int 32/64 with truncation +multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, + X86VectorVTInfo _DstRC, SDNode OpNode, + SDNode OpNodeRnd, X86FoldableSchedWrite sched, + string aliasStr, bit CodeGenOnly = 1>{ +let Predicates = [HasAVX512] in { + let isCodeGenOnly = 1 in { + def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, + EVEX, Sched<[sched]>; + def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, + EVEX, Sched<[sched.Folded, ReadAfterLd]>; + } + + def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src), + (i32 FROUND_CURRENT)))]>, + EVEX, VEX_LIG, Sched<[sched]>; + def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src), + (i32 FROUND_NO_EXC)))]>, + EVEX,VEX_LIG , EVEX_B, Sched<[sched]>; + let isCodeGenOnly = CodeGenOnly, ForceDisassemble = CodeGenOnly in + def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), + (ins _SrcRC.IntScalarMemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNodeRnd + (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src), + (i32 FROUND_CURRENT)))]>, + EVEX, VEX_LIG, Sched<[sched.Folded, ReadAfterLd]>; + + def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "rr_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">; + def : InstAlias<asm # aliasStr # "\t{{sae}, $src, $dst|$dst, $src, {sae}}", + (!cast<Instruction>(NAME # "rrb_Int") _DstRC.RC:$dst, _SrcRC.RC:$src), 0, "att">; +} //HasAVX512 +} + +multiclass avx512_cvt_s_all_unsigned<bits<8> opc, string asm, + X86VectorVTInfo _SrcRC, + X86VectorVTInfo _DstRC, SDNode OpNode, + SDNode OpNodeRnd, X86FoldableSchedWrite sched, + string aliasStr> : + avx512_cvt_s_all<opc, asm, _SrcRC, _DstRC, OpNode, OpNodeRnd, 
sched, + aliasStr, 0> { +let Predicates = [HasAVX512] in { + def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "rm_Int") _DstRC.RC:$dst, + _SrcRC.IntScalarMemOp:$src), 0, "att">; +} +} + +defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info, + fp_to_sint, X86cvtts2IntRnd, WriteCvtSS2I, "{l}">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info, + fp_to_sint, X86cvtts2IntRnd, WriteCvtSS2I, "{q}">, + VEX_W, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info, + fp_to_sint, X86cvtts2IntRnd, WriteCvtSD2I, "{l}">, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info, + fp_to_sint, X86cvtts2IntRnd, WriteCvtSD2I, "{q}">, + VEX_W, XD, EVEX_CD8<64, CD8VT1>; + +defm VCVTTSS2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i32x_info, + fp_to_uint, X86cvtts2UIntRnd, WriteCvtSS2I, "{l}">, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttss2usi", f32x_info, i64x_info, + fp_to_uint, X86cvtts2UIntRnd, WriteCvtSS2I, "{q}">, + XS,VEX_W, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2USIZ: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i32x_info, + fp_to_uint, X86cvtts2UIntRnd, WriteCvtSD2I, "{l}">, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2USI64Z: avx512_cvt_s_all_unsigned<0x78, "vcvttsd2usi", f64x_info, i64x_info, + fp_to_uint, X86cvtts2UIntRnd, WriteCvtSD2I, "{q}">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; + +let Predicates = [HasAVX512] in { + def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))), + (VCVTTSS2SIZrr_Int VR128X:$src)>; + def : Pat<(i32 (int_x86_sse_cvttss2si sse_load_f32:$src)), + (VCVTTSS2SIZrm_Int ssmem:$src)>; + def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))), + (VCVTTSS2SI64Zrr_Int VR128X:$src)>; + def : Pat<(i64 (int_x86_sse_cvttss2si64 sse_load_f32:$src)), + (VCVTTSS2SI64Zrm_Int ssmem:$src)>; + def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))), + (VCVTTSD2SIZrr_Int VR128X:$src)>; + def : Pat<(i32 (int_x86_sse2_cvttsd2si sse_load_f64:$src)), + (VCVTTSD2SIZrm_Int sdmem:$src)>; + def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))), + (VCVTTSD2SI64Zrr_Int VR128X:$src)>; + def : Pat<(i64 (int_x86_sse2_cvttsd2si64 sse_load_f64:$src)), + (VCVTTSD2SI64Zrm_Int sdmem:$src)>; +} // HasAVX512 + +//===----------------------------------------------------------------------===// +// AVX-512 Convert form float to double and back +//===----------------------------------------------------------------------===// + +multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode, + X86FoldableSchedWrite sched> { + defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_.VT _.RC:$src1), + (_Src.VT _Src.RC:$src2), + (i32 FROUND_CURRENT)))>, + EVEX_4V, VEX_LIG, Sched<[sched]>; + defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_.VT _.RC:$src1), + (_Src.VT _Src.ScalarIntMemCPat:$src2), + (i32 FROUND_CURRENT)))>, + EVEX_4V, VEX_LIG, + Sched<[sched.Folded, ReadAfterLd]>; + + let isCodeGenOnly = 1, hasSideEffects = 0 in { + def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, 
_Src.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + EVEX_4V, VEX_LIG, Sched<[sched]>; + let mayLoad = 1 in + def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _Src.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + EVEX_4V, VEX_LIG, Sched<[sched.Folded, ReadAfterLd]>; + } +} + +// Scalar Coversion with SAE - suppress all exceptions +multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd, + X86FoldableSchedWrite sched> { + defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (_.VT (OpNodeRnd (_.VT _.RC:$src1), + (_Src.VT _Src.RC:$src2), + (i32 FROUND_NO_EXC)))>, + EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>; +} + +// Scalar Conversion with rounding control (RC) +multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd, + X86FoldableSchedWrite sched> { + defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (_.VT (OpNodeRnd (_.VT _.RC:$src1), + (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>, + EVEX_4V, VEX_LIG, Sched<[sched]>, + EVEX_B, EVEX_RC; +} +multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, + SDNode OpNodeRnd, X86FoldableSchedWrite sched, + X86VectorVTInfo _src, X86VectorVTInfo _dst> { + let Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>, + avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src, + OpNodeRnd, sched>, VEX_W, EVEX_CD8<64, CD8VT1>, XD; + } +} + +multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, + X86FoldableSchedWrite sched, + X86VectorVTInfo _src, X86VectorVTInfo _dst> { + let Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>, + avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, sched>, + EVEX_CD8<32, CD8VT1>, XS; + } +} +defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", + X86froundRnd, WriteCvtSD2SS, f64x_info, + f32x_info>; +defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", + X86fpextRnd, WriteCvtSS2SD, f32x_info, + f64x_info>; + +def : Pat<(f64 (fpextend FR32X:$src)), + (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>, + Requires<[HasAVX512]>; +def : Pat<(f64 (fpextend (loadf32 addr:$src))), + (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX512, OptForSize]>; + +def : Pat<(f64 (extloadf32 addr:$src)), + (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>, + Requires<[HasAVX512, OptForSize]>; + +def : Pat<(f64 (extloadf32 addr:$src)), + (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>, + Requires<[HasAVX512, OptForSpeed]>; + +def : Pat<(f32 (fpround FR64X:$src)), + (VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>, + Requires<[HasAVX512]>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128X:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSD2SSZrr_Int VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128X:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))), + (VCVTSS2SDZrr_Int VR128X:$dst, VR128X:$src)>, + Requires<[HasAVX512]>; + 
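Editor's note (illustration only, not part of the patch): the scalar patterns in
this hunk are what let ordinary scalar C conversions select the EVEX-encoded
forms named above. A minimal sketch, assuming a compiler targeting AVX-512F
(for example clang -O2 -mavx512f); the helper names are hypothetical and the
exact instruction chosen depends on the compiler:

#include <stdint.h>

double   widen(float f)      { return (double)f;   }  /* fpextend   -> vcvtss2sd   */
float    narrow(double d)    { return (float)d;    }  /* fpround    -> vcvtsd2ss   */
int64_t  to_s64(double d)    { return (int64_t)d;  }  /* fp_to_sint -> vcvttsd2si  */
uint32_t to_u32(float f)     { return (uint32_t)f; }  /* fp_to_uint -> vcvttss2usi */
float    from_u64(uint64_t x){ return (float)x;    }  /* uint_to_fp -> vcvtusi2ss  */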
+//===----------------------------------------------------------------------===// +// AVX-512 Vector convert from signed/unsigned integer to float/double +// and from float/double to signed/unsigned integer +//===----------------------------------------------------------------------===// + +multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode, + X86FoldableSchedWrite sched, + string Broadcast = _.BroadcastStr, + string Alias = "", X86MemOperand MemOp = _Src.MemOp> { + + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_Src.VT _Src.RC:$src)))>, + EVEX, Sched<[sched]>; + + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins MemOp:$src), OpcodeStr#Alias, "$src", "$src", + (_.VT (OpNode (_Src.VT + (bitconvert (_Src.LdFrag addr:$src)))))>, + EVEX, Sched<[sched.Folded]>; + + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.ScalarMemOp:$src), OpcodeStr, + "${src}"##Broadcast, "${src}"##Broadcast, + (_.VT (OpNode (_Src.VT + (X86VBroadcast (_Src.ScalarLdFrag addr:$src))) + ))>, EVEX, EVEX_B, + Sched<[sched.Folded]>; +} +// Coversion with SAE - suppress all exceptions +multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd, + X86FoldableSchedWrite sched> { + defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, + "{sae}, $src", "$src, {sae}", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), + (i32 FROUND_NO_EXC)))>, + EVEX, EVEX_B, Sched<[sched]>; +} + +// Conversion with rounding control (RC) +multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd, + X86FoldableSchedWrite sched> { + defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr, + "$rc, $src", "$src, $rc", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>, + EVEX, EVEX_B, EVEX_RC, Sched<[sched]>; +} + +// Extend Float to Double +multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr, + X86SchedWriteWidths sched> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, + fpextend, sched.ZMM>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info, + X86vfpextRnd, sched.ZMM>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info, + X86vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend, + sched.YMM>, EVEX_V256; + } +} + +// Truncate Double to Float +multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround, sched.ZMM>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info, + X86vfproundRnd, sched.ZMM>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info, + X86vfpround, sched.XMM, "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround, + sched.YMM, "{1to4}", "{y}">, EVEX_V256; + + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + 
(!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">; + } +} + +defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>, + VEX_W, PD, EVEX_CD8<64, CD8VF>; +defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SchedWriteCvtPS2PD>, + PS, EVEX_CD8<32, CD8VH>; + +def : Pat<(v8f64 (extloadv8f32 addr:$src)), + (VCVTPS2PDZrm addr:$src)>; + +let Predicates = [HasVLX] in { + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128X:$src)))))), + (VCVTPD2PSZ128rr VR128X:$src)>; + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (loadv2f64 addr:$src)))))), + (VCVTPD2PSZ128rm addr:$src)>; + def : Pat<(v2f64 (extloadv2f32 addr:$src)), + (VCVTPS2PDZ128rm addr:$src)>; + def : Pat<(v4f64 (extloadv4f32 addr:$src)), + (VCVTPS2PDZ256rm addr:$src)>; +} + +// Convert Signed/Unsigned Doubleword to Double +multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNode128, X86SchedWriteWidths sched> { + // No rounding in this op + let Predicates = [HasAVX512] in + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode, + sched.ZMM>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info, + OpNode128, sched.XMM, "{1to2}", "", i64mem>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode, + sched.YMM>, EVEX_V256; + } +} + +// Convert Signed/Unsigned Doubleword to Float +multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched> { + let Predicates = [HasAVX512] in + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode, + sched.ZMM>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info, + OpNodeRnd, sched.ZMM>, EVEX_V512; + + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode, + sched.XMM>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode, + sched.YMM>, EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Doubleword with truncation +multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode, + sched.ZMM>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info, + OpNodeRnd, sched.ZMM>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode, + sched.XMM>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode, + sched.YMM>, EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Doubleword +multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched> { + let Predicates = [HasAVX512] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode, + sched.ZMM>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info, + OpNodeRnd, sched.ZMM>, EVEX_V512; + } + let Predicates = [HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode, + sched.XMM>, EVEX_V128; + defm Z256 
: avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
+ sched.YMM>, EVEX_V256;
+ }
+}
+
+// Convert Double to Signed/Unsigned Doubleword with truncation
+multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
+ sched.ZMM>,
+ avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ // we need "x"/"y" suffixes in order to distinguish between 128 and 256
+ // memory forms of these instructions in Asm Parser. They have the same
+ // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
+ // due to the same reason.
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
+ OpNode, sched.XMM, "{1to2}", "{x}">, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
+ sched.YMM, "{1to4}", "{y}">, EVEX_V256;
+
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">;
+ }
+}
+
+// Convert Double to Signed/Unsigned Doubleword
+multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
+ sched.ZMM>,
+ avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info,
+ OpNodeRnd, sched.ZMM>, EVEX_V512;
+ }
+ let Predicates = [HasVLX] in {
+ // we need "x"/"y" suffixes in order to distinguish between 128 and 256
+ // memory forms of these instructions in Asm Parser. They have the same
+ // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
+ // due to the same reason.
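// Editor's note, not part of the original patch: with a memory source both the
// 128-bit and 256-bit forms of these conversions write an XMM register, so
// AT&T syntax needs the suffix to pick the load width, e.g.
// "vcvtpd2dqx (%rdi), %xmm0" reads two doubles while
// "vcvtpd2dqy (%rdi), %xmm0" reads four.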
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode, + sched.XMM, "{1to2}", "{x}">, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, + sched.YMM, "{1to4}", "{y}">, EVEX_V256; + + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, f128mem:$src), 0, "intel">; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, f256mem:$src), 0, "intel">; + } +} + +// Convert Double to Signed/Unsigned Quardword +multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode, + sched.ZMM>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info, + OpNodeRnd, sched.ZMM>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode, + sched.XMM>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode, + sched.YMM>, EVEX_V256; + } +} + +// Convert Double to Signed/Unsigned Quardword with truncation +multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode, + sched.ZMM>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info, + OpNodeRnd, sched.ZMM>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode, + sched.XMM>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode, + sched.YMM>, EVEX_V256; + } +} + +// Convert Signed/Unsigned Quardword to Double +multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode, + sched.ZMM>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info, + OpNodeRnd, sched.ZMM>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode, + sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode, + sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible; + } +} + +// Convert Float to Signed/Unsigned Quardword +multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, + sched.ZMM>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info, + OpNodeRnd, sched.ZMM>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // Explicitly specified broadcast string, since we take only 2 elements + // from v4f32x_info source + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, + sched.XMM, "{1to2}", "", f64mem>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode, + 
sched.YMM>, EVEX_V256; + } +} + +// Convert Float to Signed/Unsigned Quardword with truncation +multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, sched.ZMM>, + avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info, + OpNodeRnd, sched.ZMM>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // Explicitly specified broadcast string, since we take only 2 elements + // from v4f32x_info source + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, + sched.XMM, "{1to2}", "", f64mem>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode, + sched.YMM>, EVEX_V256; + } +} + +// Convert Signed/Unsigned Quardword to Float +multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNode128, SDNode OpNodeRnd, + X86SchedWriteWidths sched> { + let Predicates = [HasDQI] in { + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode, + sched.ZMM>, + avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info, + OpNodeRnd, sched.ZMM>, EVEX_V512; + } + let Predicates = [HasDQI, HasVLX] in { + // we need "x"/"y" suffixes in order to distinguish between 128 and 256 + // memory forms of these instructions in Asm Parcer. They have the same + // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly + // due to the same reason. + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128, + sched.XMM, "{1to2}", "{x}">, EVEX_V128, + NotEVEX2VEXConvertible; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode, + sched.YMM, "{1to4}", "{y}">, EVEX_V256, + NotEVEX2VEXConvertible; + + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; + def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst, i128mem:$src), 0, "intel">; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0>; + def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}", + (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst, i256mem:$src), 0, "intel">; + } +} + +defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP, + SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>; + +defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp, + X86VSintToFpRnd, SchedWriteCvtDQ2PS>, + PS, EVEX_CD8<32, CD8VF>; + +defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86cvttp2si, + X86cvttp2siRnd, SchedWriteCvtPS2DQ>, + XS, EVEX_CD8<32, CD8VF>; + +defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86cvttp2si, + X86cvttp2siRnd, SchedWriteCvtPD2DQ>, + PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86cvttp2ui, + X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PS, + EVEX_CD8<32, CD8VF>; + +defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86cvttp2ui, + X86cvttp2uiRnd, SchedWriteCvtPD2DQ>, + PS, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, + X86VUintToFP, SchedWriteCvtDQ2PD>, XS, + EVEX_CD8<32, CD8VH>; + +defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp, + X86VUintToFpRnd, SchedWriteCvtDQ2PS>, XD, + EVEX_CD8<32, CD8VF>; + +defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int, + 
X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD, + EVEX_CD8<32, CD8VF>; + +defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int, + X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, XD, + VEX_W, EVEX_CD8<64, CD8VF>; + +defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt, + X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, + PS, EVEX_CD8<32, CD8VF>; + +defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt, + X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W, + PS, EVEX_CD8<64, CD8VF>; + +defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int, + X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int, + X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD, + EVEX_CD8<32, CD8VH>; + +defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt, + X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, + X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, PD, + EVEX_CD8<32, CD8VH>; + +defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86cvttp2si, + X86cvttp2siRnd, SchedWriteCvtPD2DQ>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86cvttp2si, + X86cvttp2siRnd, SchedWriteCvtPS2DQ>, PD, + EVEX_CD8<32, CD8VH>; + +defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86cvttp2ui, + X86cvttp2uiRnd, SchedWriteCvtPD2DQ>, VEX_W, + PD, EVEX_CD8<64, CD8VF>; + +defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86cvttp2ui, + X86cvttp2uiRnd, SchedWriteCvtPS2DQ>, PD, + EVEX_CD8<32, CD8VH>; + +defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp, + X86VSintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS, + EVEX_CD8<64, CD8VF>; + +defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp, + X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS, + EVEX_CD8<64, CD8VF>; + +defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP, + X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS, + EVEX_CD8<64, CD8VF>; + +defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP, + X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD, + EVEX_CD8<64, CD8VF>; + +let Predicates = [HasAVX512] in { + def : Pat<(v16i32 (fp_to_sint (v16f32 VR512:$src))), + (VCVTTPS2DQZrr VR512:$src)>; + def : Pat<(v16i32 (fp_to_sint (loadv16f32 addr:$src))), + (VCVTTPS2DQZrm addr:$src)>; + + def : Pat<(v16i32 (fp_to_uint (v16f32 VR512:$src))), + (VCVTTPS2UDQZrr VR512:$src)>; + def : Pat<(v16i32 (fp_to_uint (loadv16f32 addr:$src))), + (VCVTTPS2UDQZrm addr:$src)>; + + def : Pat<(v8i32 (fp_to_sint (v8f64 VR512:$src))), + (VCVTTPD2DQZrr VR512:$src)>; + def : Pat<(v8i32 (fp_to_sint (loadv8f64 addr:$src))), + (VCVTTPD2DQZrm addr:$src)>; + + def : Pat<(v8i32 (fp_to_uint (v8f64 VR512:$src))), + (VCVTTPD2UDQZrr VR512:$src)>; + def : Pat<(v8i32 (fp_to_uint (loadv8f64 addr:$src))), + (VCVTTPD2UDQZrm addr:$src)>; +} + +let Predicates = [HasVLX] in { + def : Pat<(v4i32 (fp_to_sint (v4f32 VR128X:$src))), + (VCVTTPS2DQZ128rr VR128X:$src)>; + def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))), + (VCVTTPS2DQZ128rm addr:$src)>; + + def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src))), + (VCVTTPS2UDQZ128rr VR128X:$src)>; + def : Pat<(v4i32 (fp_to_uint (loadv4f32 addr:$src))), + (VCVTTPS2UDQZ128rm addr:$src)>; + + def : Pat<(v8i32 (fp_to_sint (v8f32 VR256X:$src))), + (VCVTTPS2DQZ256rr VR256X:$src)>; + def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))), + (VCVTTPS2DQZ256rm addr:$src)>; + + def : Pat<(v8i32 (fp_to_uint 
(v8f32 VR256X:$src))), + (VCVTTPS2UDQZ256rr VR256X:$src)>; + def : Pat<(v8i32 (fp_to_uint (loadv8f32 addr:$src))), + (VCVTTPS2UDQZ256rm addr:$src)>; + + def : Pat<(v4i32 (fp_to_sint (v4f64 VR256X:$src))), + (VCVTTPD2DQZ256rr VR256X:$src)>; + def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), + (VCVTTPD2DQZ256rm addr:$src)>; + + def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src))), + (VCVTTPD2UDQZ256rr VR256X:$src)>; + def : Pat<(v4i32 (fp_to_uint (loadv4f64 addr:$src))), + (VCVTTPD2UDQZ256rm addr:$src)>; +} + +let Predicates = [HasDQI] in { + def : Pat<(v8i64 (fp_to_sint (v8f32 VR256X:$src))), + (VCVTTPS2QQZrr VR256X:$src)>; + def : Pat<(v8i64 (fp_to_sint (loadv8f32 addr:$src))), + (VCVTTPS2QQZrm addr:$src)>; + + def : Pat<(v8i64 (fp_to_uint (v8f32 VR256X:$src))), + (VCVTTPS2UQQZrr VR256X:$src)>; + def : Pat<(v8i64 (fp_to_uint (loadv8f32 addr:$src))), + (VCVTTPS2UQQZrm addr:$src)>; + + def : Pat<(v8i64 (fp_to_sint (v8f64 VR512:$src))), + (VCVTTPD2QQZrr VR512:$src)>; + def : Pat<(v8i64 (fp_to_sint (loadv8f64 addr:$src))), + (VCVTTPD2QQZrm addr:$src)>; + + def : Pat<(v8i64 (fp_to_uint (v8f64 VR512:$src))), + (VCVTTPD2UQQZrr VR512:$src)>; + def : Pat<(v8i64 (fp_to_uint (loadv8f64 addr:$src))), + (VCVTTPD2UQQZrm addr:$src)>; +} + +let Predicates = [HasDQI, HasVLX] in { + def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src))), + (VCVTTPS2QQZ256rr VR128X:$src)>; + def : Pat<(v4i64 (fp_to_sint (loadv4f32 addr:$src))), + (VCVTTPS2QQZ256rm addr:$src)>; + + def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src))), + (VCVTTPS2UQQZ256rr VR128X:$src)>; + def : Pat<(v4i64 (fp_to_uint (loadv4f32 addr:$src))), + (VCVTTPS2UQQZ256rm addr:$src)>; + + def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src))), + (VCVTTPD2QQZ128rr VR128X:$src)>; + def : Pat<(v2i64 (fp_to_sint (loadv2f64 addr:$src))), + (VCVTTPD2QQZ128rm addr:$src)>; + + def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src))), + (VCVTTPD2UQQZ128rr VR128X:$src)>; + def : Pat<(v2i64 (fp_to_uint (loadv2f64 addr:$src))), + (VCVTTPD2UQQZ128rm addr:$src)>; + + def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src))), + (VCVTTPD2QQZ256rr VR256X:$src)>; + def : Pat<(v4i64 (fp_to_sint (loadv4f64 addr:$src))), + (VCVTTPD2QQZ256rm addr:$src)>; + + def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src))), + (VCVTTPD2UQQZ256rr VR256X:$src)>; + def : Pat<(v4i64 (fp_to_uint (loadv4f64 addr:$src))), + (VCVTTPD2UQQZ256rm addr:$src)>; +} + +let Predicates = [HasAVX512, NoVLX] in { +def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))), + (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr + (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))), + (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_xmm)>; + +def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), + (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))), + (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr + (v8i32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_ymm)>; + +def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))), + 
(EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr + (v8i32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; +} + +let Predicates = [HasAVX512, HasVLX] in { + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))), + (VCVTPD2DQZ128rr VR128X:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))), + (VCVTPD2DQZ128rm addr:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))), + (VCVTPD2UDQZ128rr VR128X:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))), + (VCVTTPD2DQZ128rr VR128X:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))), + (VCVTTPD2DQZ128rm addr:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))), + (VCVTTPD2UDQZ128rr VR128X:$src)>; + + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (VCVTDQ2PDZ128rm addr:$src)>; + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), + (VCVTDQ2PDZ128rm addr:$src)>; + + def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (VCVTUDQ2PDZ128rm addr:$src)>; + def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), + (VCVTUDQ2PDZ128rm addr:$src)>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))), + (VCVTPD2PSZrm addr:$src)>; + def : Pat<(v8f64 (extloadv8f32 addr:$src)), + (VCVTPS2PDZrm addr:$src)>; +} + +let Predicates = [HasDQI, HasVLX] in { + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86VSintToFP (v2i64 VR128X:$src)))))), + (VCVTQQ2PSZ128rr VR128X:$src)>; + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86VUintToFP (v2i64 VR128X:$src)))))), + (VCVTUQQ2PSZ128rr VR128X:$src)>; +} + +let Predicates = [HasDQI, NoVLX] in { +def : Pat<(v2i64 (fp_to_sint (v2f64 VR128X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v4i64 (fp_to_sint (v4f32 VR128X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr + (v8f32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_ymm)>; + +def : Pat<(v4i64 (fp_to_sint (v4f64 VR256X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v2i64 (fp_to_uint (v2f64 VR128X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v4i64 (fp_to_uint (v4f32 VR128X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr + (v8f32 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_ymm)>; + +def : Pat<(v4i64 (fp_to_uint (v4f64 VR256X:$src1))), + (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr + (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4f32 (sint_to_fp (v4i64 VR256X:$src1))), + (EXTRACT_SUBREG (v8f32 (VCVTQQ2PSZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_xmm)>; + +def : Pat<(v2f64 (sint_to_fp (v2i64 VR128X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v4f64 (sint_to_fp (v4i64 VR256X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; + +def : Pat<(v4f32 (uint_to_fp (v4i64 
VR256X:$src1))), + (EXTRACT_SUBREG (v8f32 (VCVTUQQ2PSZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_xmm)>; + +def : Pat<(v2f64 (uint_to_fp (v2i64 VR128X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR128X:$src1, sub_xmm)))), sub_xmm)>; + +def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))), + (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src1, sub_ymm)))), sub_ymm)>; +} + +//===----------------------------------------------------------------------===// +// Half precision conversion instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop, PatFrag ld_frag, + X86FoldableSchedWrite sched> { + defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src), "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT _src.RC:$src))>, + T8PD, Sched<[sched]>; + defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), + (ins x86memop:$src), "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT + (bitconvert + (ld_frag addr:$src))))>, + T8PD, Sched<[sched.Folded]>; +} + +multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86FoldableSchedWrite sched> { + defm rrb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst), + (ins _src.RC:$src), "vcvtph2ps", + "{sae}, $src", "$src, {sae}", + (X86cvtph2psRnd (_src.VT _src.RC:$src), + (i32 FROUND_NO_EXC))>, + T8PD, EVEX_B, Sched<[sched]>; +} + +let Predicates = [HasAVX512] in + defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64, + WriteCvtPH2PSZ>, + avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, WriteCvtPH2PSZ>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + +let Predicates = [HasVLX] in { + defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem, + loadv2i64, WriteCvtPH2PSY>, EVEX, EVEX_V256, + EVEX_CD8<32, CD8VH>; + defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem, + loadv2i64, WriteCvtPH2PS>, EVEX, EVEX_V128, + EVEX_CD8<32, CD8VH>; + + // Pattern match vcvtph2ps of a scalar i64 load. 
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))), + (VCVTPH2PSZ128rm addr:$src)>; + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))), + (VCVTPH2PSZ128rm addr:$src)>; + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert + (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (VCVTPH2PSZ128rm addr:$src)>; +} + +multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop, SchedWrite RR, SchedWrite MR> { + defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, $src1", "$src1, $src2", + (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2)), 0, 0>, + AVX512AIi8Base, Sched<[RR]>; + let hasSideEffects = 0, mayStore = 1 in { + def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + Sched<[MR]>; + def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", []>, + EVEX_K, Sched<[MR]>, NotMemoryFoldable; + } +} + +multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src, + SchedWrite Sched> { + let hasSideEffects = 0 in + defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest, + (outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2", []>, + EVEX_B, AVX512AIi8Base, Sched<[Sched]>; +} + +let Predicates = [HasAVX512] in { + defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem, + WriteCvtPS2PHZ, WriteCvtPS2PHZSt>, + avx512_cvtps2ph_sae<v16i16x_info, v16f32_info, WriteCvtPS2PHZ>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + let Predicates = [HasVLX] in { + defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem, + WriteCvtPS2PHY, WriteCvtPS2PHYSt>, + EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem, + WriteCvtPS2PH, WriteCvtPS2PHSt>, + EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + } + + def : Pat<(store (f64 (extractelt + (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))), + (iPTR 0))), addr:$dst), + (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>; + def : Pat<(store (i64 (extractelt + (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))), + (iPTR 0))), addr:$dst), + (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>; + def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, i32:$src2)), addr:$dst), + (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, imm:$src2)>; + def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, i32:$src2)), addr:$dst), + (VCVTPS2PHZmr addr:$dst, VR512:$src1, imm:$src2)>; +} + +// Patterns for matching conversions from float to half-float and vice versa. +let Predicates = [HasVLX] in { + // Use MXCSR.RC for rounding instead of explicitly specifying the default + // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the + // configurations we support (the default). However, falling back to MXCSR is + // more consistent with other instructions, which are always controlled by it. + // It's encoded as 0b100. 
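// Editor's note, not part of the original patch: in the VCVTPS2PH immediate,
// bits [1:0] select the rounding mode and bit [2] means "ignore bits [1:0] and
// use MXCSR.RC instead", so the value 4 passed in the patterns below requests
// the current MXCSR rounding mode.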
+ def : Pat<(fp_to_f16 FR32X:$src), + (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (v8i16 (VCVTPS2PHZ128rr + (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4))), sub_16bit))>; + + def : Pat<(f16_to_fp GR16:$src), + (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr + (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)))), FR32X)) >; + + def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))), + (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr + (v8i16 (VCVTPS2PHZ128rr + (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4)))), FR32X)) >; +} + +// Unordered/Ordered scalar fp compare with Sea and set EFLAGS +multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, + string OpcodeStr, X86FoldableSchedWrite sched> { + let hasSideEffects = 0 in + def rrb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), []>, + EVEX, EVEX_B, VEX_LIG, EVEX_V128, Sched<[sched]>; +} + +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", WriteFCom>, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", WriteFCom>, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", WriteFCom>, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", WriteFCom>, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; +} + +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, + "ucomiss", WriteFCom>, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64, + "ucomisd", WriteFCom>, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + let Pattern = []<dag> in { + defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32, + "comiss", WriteFCom>, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64, + "comisd", WriteFCom>, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + } + let isCodeGenOnly = 1 in { + defm VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem, + sse_load_f32, "ucomiss", WriteFCom>, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem, + sse_load_f64, "ucomisd", WriteFCom>, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + + defm VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem, + sse_load_f32, "comiss", WriteFCom>, PS, EVEX, VEX_LIG, + EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem, + sse_load_f64, "comisd", WriteFCom>, PD, EVEX, + VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + } +} + +/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd +multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { + defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, + EVEX_4V, Sched<[sched]>; + defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + _.ScalarIntMemCPat:$src2)>, EVEX_4V, + 
Sched<[sched.Folded, ReadAfterLd]>; +} +} + +defm VRCP14SSZ : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SchedWriteFRcp.Scl, + f32x_info>, EVEX_CD8<32, CD8VT1>, + T8PD; +defm VRCP14SDZ : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, SchedWriteFRcp.Scl, + f64x_info>, VEX_W, EVEX_CD8<64, CD8VT1>, + T8PD; +defm VRSQRT14SSZ : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, + SchedWriteFRsqrt.Scl, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRSQRT14SDZ : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s, + SchedWriteFRsqrt.Scl, f64x_info>, VEX_W, + EVEX_CD8<64, CD8VT1>, T8PD; + +/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd +multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode _.RC:$src))>, EVEX, T8PD, + Sched<[sched]>; + defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.VT + (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD, + Sched<[sched.Folded, ReadAfterLd]>; + defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (OpNode (_.VT + (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + EVEX, T8PD, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched> { + defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, sched.ZMM, + v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, sched.ZMM, + v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + + // Define only if AVX512VL feature is present. 
+ let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), + OpNode, sched.XMM, v4f32x_info>, + EVEX_V128, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), + OpNode, sched.YMM, v8f32x_info>, + EVEX_V256, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), + OpNode, sched.XMM, v2f64x_info>, + EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), + OpNode, sched.YMM, v4f64x_info>, + EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; + } +} + +defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14, SchedWriteFRsqrt>; +defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>; + +/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd +multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + SDNode OpNode, X86FoldableSchedWrite sched> { + let ExeDomain = _.ExeDomain in { + defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_CURRENT))>, + Sched<[sched]>; + + defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B, + Sched<[sched]>; + + defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, + (i32 FROUND_CURRENT))>, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched> { + defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, sched>, + EVEX_CD8<32, CD8VT1>; + defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, sched>, + EVEX_CD8<64, CD8VT1>, VEX_W; +} + +let Predicates = [HasERI] in { + defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, SchedWriteFRcp.Scl>, + T8PD, EVEX_4V; + defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, + SchedWriteFRsqrt.Scl>, T8PD, EVEX_4V; +} + +defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds, + SchedWriteFRnd.Scl>, T8PD, EVEX_4V; +/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd + +multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + SDNode OpNode, X86FoldableSchedWrite sched> { + let ExeDomain = _.ExeDomain in { + defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>, + Sched<[sched]>; + + defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (OpNode (_.VT + (bitconvert (_.LdFrag addr:$src))), + (i32 FROUND_CURRENT))>, + Sched<[sched.Folded, ReadAfterLd]>; + + defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (OpNode (_.VT + (X86VBroadcast (_.ScalarLdFrag addr:$src))), + (i32 FROUND_CURRENT))>, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; + } +} +multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + SDNode OpNode, X86FoldableSchedWrite sched> { + let ExeDomain = _.ExeDomain in + defm rb : AVX512_maskable<opc, MRMSrcReg, 
_, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, + "{sae}, $src", "$src, {sae}", + (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, + EVEX_B, Sched<[sched]>; +} + +multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched> { + defm PSZ : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>, + avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>, + T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>, + avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>, + T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86SchedWriteWidths sched> { + // Define only if AVX512VL feature is present. + let Predicates = [HasVLX] in { + defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode, sched.XMM>, + EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode, sched.YMM>, + EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode, sched.XMM>, + EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode, sched.YMM>, + EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; + } +} + +let Predicates = [HasERI] in { + defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, SchedWriteFRsqrt>, EVEX; + defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, SchedWriteFRcp>, EVEX; + defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, SchedWriteFAdd>, EVEX; +} +defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd, SchedWriteFRnd>, + avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd, + SchedWriteFRnd>, EVEX; + +multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _>{ + let ExeDomain = _.ExeDomain in + defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc", + (_.VT (X86fsqrtRnd _.RC:$src, (i32 imm:$rc)))>, + EVEX, EVEX_B, EVEX_RC, Sched<[sched]>; +} + +multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _>{ + let ExeDomain = _.ExeDomain in { + defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.VT (fsqrt _.RC:$src))>, EVEX, + Sched<[sched]>; + defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src), OpcodeStr, "$src", "$src", + (fsqrt (_.VT + (bitconvert (_.LdFrag addr:$src))))>, EVEX, + Sched<[sched.Folded, ReadAfterLd]>; + defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, + "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, + (fsqrt (_.VT + (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, + EVEX, EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, + X86SchedWriteSizes sched> { + defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), + sched.PS.ZMM, v16f32_info>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), + sched.PD.ZMM, v8f64_info>, + EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; + // Define only if AVX512VL feature is present. 
+ let Predicates = [HasVLX] in { + defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), + sched.PS.XMM, v4f32x_info>, + EVEX_V128, PS, EVEX_CD8<32, CD8VF>; + defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), + sched.PS.YMM, v8f32x_info>, + EVEX_V256, PS, EVEX_CD8<32, CD8VF>; + defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), + sched.PD.XMM, v2f64x_info>, + EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>; + defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), + sched.PD.YMM, v4f64x_info>, + EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>; + } +} + +multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr, + X86SchedWriteSizes sched> { + defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), + sched.PS.ZMM, v16f32_info>, + EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), + sched.PD.ZMM, v8f64_info>, + EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, + X86VectorVTInfo _, string Name> { + let ExeDomain = _.ExeDomain in { + defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (X86fsqrtRnds (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 FROUND_CURRENT))>, + Sched<[sched]>; + defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (X86fsqrtRnds (_.VT _.RC:$src1), + _.ScalarIntMemCPat:$src2, + (i32 FROUND_CURRENT))>, + Sched<[sched.Folded, ReadAfterLd]>; + defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (X86fsqrtRnds (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$rc))>, + EVEX_B, EVEX_RC, Sched<[sched]>; + + let isCodeGenOnly = 1, hasSideEffects = 0, Predicates=[HasAVX512] in { + def r : I<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + Sched<[sched]>; + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + Sched<[sched.Folded, ReadAfterLd]>; + } + } + + let Predicates = [HasAVX512] in { + def : Pat<(_.EltVT (fsqrt _.FRC:$src)), + (!cast<Instruction>(Name#Zr) + (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>; + } + + let Predicates = [HasAVX512, OptForSize] in { + def : Pat<(_.EltVT (fsqrt (load addr:$src))), + (!cast<Instruction>(Name#Zm) + (_.EltVT (IMPLICIT_DEF)), addr:$src)>; + } +} + +multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr, + X86SchedWriteSizes sched> { + defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">, + EVEX_CD8<32, CD8VT1>, EVEX_4V, XS; + defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">, + EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W; +} + +defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", SchedWriteFSqrtSizes>, + avx512_sqrt_packed_all_round<0x51, "vsqrt", SchedWriteFSqrtSizes>; + +defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt", SchedWriteFSqrtSizes>, VEX_LIG; + +multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + defm r_Int : 
AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3)))>, + Sched<[sched]>; + + defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, + "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3", + (_.VT (X86RndScalesRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B, + Sched<[sched]>; + + defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3), + OpcodeStr, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86RndScales _.RC:$src1, + _.ScalarIntMemCPat:$src2, (i32 imm:$src3)))>, + Sched<[sched.Folded, ReadAfterLd]>; + + let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in { + def r : I<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2, i32u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[sched]>; + + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[sched.Folded, ReadAfterLd]>; + } + } + + let Predicates = [HasAVX512] in { + def : Pat<(ffloor _.FRC:$src), + (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), + _.FRC:$src, (i32 0x9)))>; + def : Pat<(fceil _.FRC:$src), + (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), + _.FRC:$src, (i32 0xa)))>; + def : Pat<(ftrunc _.FRC:$src), + (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), + _.FRC:$src, (i32 0xb)))>; + def : Pat<(frint _.FRC:$src), + (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), + _.FRC:$src, (i32 0x4)))>; + def : Pat<(fnearbyint _.FRC:$src), + (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), + _.FRC:$src, (i32 0xc)))>; + } + + let Predicates = [HasAVX512, OptForSize] in { + def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), + (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), + addr:$src, (i32 0x9)))>; + def : Pat<(fceil (_.ScalarLdFrag addr:$src)), + (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), + addr:$src, (i32 0xa)))>; + def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), + (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), + addr:$src, (i32 0xb)))>; + def : Pat<(frint (_.ScalarLdFrag addr:$src)), + (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), + addr:$src, (i32 0x4)))>; + def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), + (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), + addr:$src, (i32 0xc)))>; + } +} + +defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless", + SchedWriteFRnd.Scl, f32x_info>, + AVX512AIi8Base, EVEX_4V, + EVEX_CD8<32, CD8VT1>; + +defm VRNDSCALESDZ : avx512_rndscale_scalar<0x0B, "vrndscalesd", + SchedWriteFRnd.Scl, f64x_info>, + VEX_W, AVX512AIi8Base, EVEX_4V, + EVEX_CD8<64, CD8VT1>; + +multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move, + dag Mask, X86VectorVTInfo _, PatLeaf ZeroFP, + dag OutMask, Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask, + (OpNode (extractelt _.VT:$src2, (iPTR 0))), + (extractelt _.VT:$dst, (iPTR 0))))), 
+ (!cast<Instruction>("V"#OpcPrefix#r_Intk) + _.VT:$dst, OutMask, _.VT:$src2, _.VT:$src1)>; + + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask, + (OpNode (extractelt _.VT:$src2, (iPTR 0))), + ZeroFP))), + (!cast<Instruction>("V"#OpcPrefix#r_Intkz) + OutMask, _.VT:$src2, _.VT:$src1)>; + } +} + +defm : avx512_masked_scalar<fsqrt, "SQRTSSZ", X86Movss, + (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v4f32x_info, + fp32imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>; +defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd, + (v1i1 (scalar_to_vector (i8 (trunc (i32 GR32:$mask))))), v2f64x_info, + fp64imm0, (COPY_TO_REGCLASS $mask, VK1WM), HasAVX512>; + +multiclass avx512_masked_scalar_imm<SDNode OpNode, string OpcPrefix, SDNode Move, + X86VectorVTInfo _, PatLeaf ZeroFP, + bits<8> ImmV, Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask, + (OpNode (extractelt _.VT:$src2, (iPTR 0))), + (extractelt _.VT:$dst, (iPTR 0))))), + (!cast<Instruction>("V"#OpcPrefix#Zr_Intk) + _.VT:$dst, VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>; + + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects VK1WM:$mask, + (OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))), + (!cast<Instruction>("V"#OpcPrefix#Zr_Intkz) + VK1WM:$mask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>; + } +} + +defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESS", X86Movss, + v4f32x_info, fp32imm0, 0x01, HasAVX512>; +defm : avx512_masked_scalar_imm<fceil, "RNDSCALESS", X86Movss, + v4f32x_info, fp32imm0, 0x02, HasAVX512>; +defm : avx512_masked_scalar_imm<ffloor, "RNDSCALESD", X86Movsd, + v2f64x_info, fp64imm0, 0x01, HasAVX512>; +defm : avx512_masked_scalar_imm<fceil, "RNDSCALESD", X86Movsd, + v2f64x_info, fp64imm0, 0x02, HasAVX512>; + + +//------------------------------------------------- +// Integer truncate and extend operations +//------------------------------------------------- + +multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, X86MemOperand x86memop> { + let ExeDomain = DestInfo.ExeDomain in + defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>, + EVEX, T8XS, Sched<[sched]>; + + let mayStore = 1, hasSideEffects = 0, ExeDomain = DestInfo.ExeDomain in { + def mr : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, SrcInfo.RC:$src), + OpcodeStr # "\t{$src, $dst|$dst, $src}", []>, + EVEX, Sched<[sched.Folded]>; + + def mrk : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", []>, + EVEX, EVEX_K, Sched<[sched.Folded]>, NotMemoryFoldable; + }//mayStore = 1, hasSideEffects = 0 +} + +multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, + PatFrag truncFrag, PatFrag mtruncFrag, + string Name> { + + def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst), + (!cast<Instruction>(Name#SrcInfo.ZSuffix##mr) + addr:$dst, SrcInfo.RC:$src)>; + + def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask, + (SrcInfo.VT SrcInfo.RC:$src)), + (!cast<Instruction>(Name#SrcInfo.ZSuffix##mrk) + addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>; +} + +multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode128, + SDNode OpNode256, SDNode 
OpNode512, X86FoldableSchedWrite sched, + AVX512VLVectorVTInfo VTSrcInfo, + X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, PatFrag truncFrag, + PatFrag mtruncFrag, Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode128, sched, + VTSrcInfo.info128, DestInfoZ128, x86memopZ128>, + avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + truncFrag, mtruncFrag, NAME>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode256, sched, + VTSrcInfo.info256, DestInfoZ256, x86memopZ256>, + avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + truncFrag, mtruncFrag, NAME>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode512, sched, + VTSrcInfo.info512, DestInfoZ, x86memopZ>, + avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ, + truncFrag, mtruncFrag, NAME>, EVEX_V512; +} + +multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, PatFrag StoreNode, + PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, InVecNode, InVecNode, sched, + avx512vl_i64_info, v16i8x_info, v16i8x_info, + v16i8x_info, i16mem, i32mem, i64mem, StoreNode, + MaskedStoreNode>, EVEX_CD8<8, CD8VO>; +} + +multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, PatFrag StoreNode, + PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, InVecNode, OpNode, sched, + avx512vl_i64_info, v8i16x_info, v8i16x_info, + v8i16x_info, i32mem, i64mem, i128mem, StoreNode, + MaskedStoreNode>, EVEX_CD8<16, CD8VQ>; +} + +multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, PatFrag StoreNode, + PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, OpNode, OpNode, sched, + avx512vl_i64_info, v4i32x_info, v4i32x_info, + v8i32x_info, i64mem, i128mem, i256mem, StoreNode, + MaskedStoreNode>, EVEX_CD8<32, CD8VH>; +} + +multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, PatFrag StoreNode, + PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, InVecNode, OpNode, sched, + avx512vl_i32_info, v16i8x_info, v16i8x_info, + v16i8x_info, i32mem, i64mem, i128mem, StoreNode, + MaskedStoreNode>, EVEX_CD8<8, CD8VQ>; +} + +multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, PatFrag StoreNode, + PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, OpNode, OpNode, sched, + avx512vl_i32_info, v8i16x_info, v8i16x_info, + v16i16x_info, i64mem, i128mem, i256mem, StoreNode, + MaskedStoreNode>, EVEX_CD8<16, CD8VH>; +} + +multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, PatFrag StoreNode, + PatFrag MaskedStoreNode, SDNode InVecNode = OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, InVecNode, OpNode, OpNode, + sched, avx512vl_i16_info, v16i8x_info, v16i8x_info, + v32i8x_info, i64mem, i128mem, i256mem, StoreNode, + MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>; +} + +defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", trunc, WriteShuffle256, + 
truncstorevi8, masked_truncstorevi8, X86vtrunc>; +defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs, WriteShuffle256, + truncstore_s_vi8, masked_truncstore_s_vi8>; +defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus, WriteShuffle256, + truncstore_us_vi8, masked_truncstore_us_vi8>; + +defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", trunc, WriteShuffle256, + truncstorevi16, masked_truncstorevi16, X86vtrunc>; +defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs, WriteShuffle256, + truncstore_s_vi16, masked_truncstore_s_vi16>; +defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus, WriteShuffle256, + truncstore_us_vi16, masked_truncstore_us_vi16>; + +defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", trunc, WriteShuffle256, + truncstorevi32, masked_truncstorevi32, X86vtrunc>; +defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs, WriteShuffle256, + truncstore_s_vi32, masked_truncstore_s_vi32>; +defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus, WriteShuffle256, + truncstore_us_vi32, masked_truncstore_us_vi32>; + +defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", trunc, WriteShuffle256, + truncstorevi8, masked_truncstorevi8, X86vtrunc>; +defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs, WriteShuffle256, + truncstore_s_vi8, masked_truncstore_s_vi8>; +defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus, WriteShuffle256, + truncstore_us_vi8, masked_truncstore_us_vi8>; + +defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", trunc, WriteShuffle256, + truncstorevi16, masked_truncstorevi16, X86vtrunc>; +defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs, WriteShuffle256, + truncstore_s_vi16, masked_truncstore_s_vi16>; +defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus, WriteShuffle256, + truncstore_us_vi16, masked_truncstore_us_vi16>; + +defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", trunc, WriteShuffle256, + truncstorevi8, masked_truncstorevi8, X86vtrunc>; +defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs, WriteShuffle256, + truncstore_s_vi8, masked_truncstore_s_vi8>; +defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, WriteShuffle256, + truncstore_us_vi8, masked_truncstore_us_vi8>; + +let Predicates = [HasAVX512, NoVLX] in { +def: Pat<(v8i16 (trunc (v8i32 VR256X:$src))), + (v8i16 (EXTRACT_SUBREG + (v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src, sub_ymm)))), sub_xmm))>; +def: Pat<(v4i32 (trunc (v4i64 VR256X:$src))), + (v4i32 (EXTRACT_SUBREG + (v8i32 (VPMOVQDZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src, sub_ymm)))), sub_xmm))>; +} + +let Predicates = [HasBWI, NoVLX] in { +def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))), + (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF), + VR256X:$src, sub_ymm))), sub_xmm))>; +} + +multiclass WriteShuffle256_common<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, + X86MemOperand x86memop, PatFrag LdFrag, SDNode OpNode>{ + let ExeDomain = DestInfo.ExeDomain in { + defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>, + EVEX, Sched<[sched]>; + + defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins x86memop:$src), OpcodeStr ,"$src", "$src", + (DestInfo.VT (LdFrag addr:$src))>, + EVEX, Sched<[sched.Folded]>; + } +} + +multiclass WriteShuffle256_BW<bits<8> opc, 
string OpcodeStr, + SDNode OpNode, SDNode InVecNode, string ExtTy, + X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + let Predicates = [HasVLX, HasBWI] in { + defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v8i16x_info, + v16i8x_info, i64mem, LdFrag, InVecNode>, + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128, VEX_WIG; + + defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v16i16x_info, + v16i8x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256, VEX_WIG; + } + let Predicates = [HasBWI] in { + defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v32i16_info, + v32i8x_info, i256mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512, VEX_WIG; + } +} + +multiclass WriteShuffle256_BD<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode InVecNode, string ExtTy, + X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v4i32x_info, + v16i8x_info, i32mem, LdFrag, InVecNode>, + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; + + defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info, + v16i8x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; + } + let Predicates = [HasAVX512] in { + defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v16i32_info, + v16i8x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512, VEX_WIG; + } +} + +multiclass WriteShuffle256_BQ<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode InVecNode, string ExtTy, + X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info, + v16i8x_info, i16mem, LdFrag, InVecNode>, + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG; + + defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info, + v16i8x_info, i32mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG; + } + let Predicates = [HasAVX512] in { + defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info, + v16i8x_info, i64mem, LdFrag, OpNode>, + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG; + } +} + +multiclass WriteShuffle256_WD<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode InVecNode, string ExtTy, + X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v4i32x_info, + v8i16x_info, i64mem, LdFrag, InVecNode>, + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128, VEX_WIG; + + defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v8i32x_info, + v8i16x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256, VEX_WIG; + } + let Predicates = [HasAVX512] in { + defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v16i32_info, + v16i16x_info, i256mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512, VEX_WIG; + } +} + +multiclass WriteShuffle256_WQ<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode InVecNode, string ExtTy, + X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info, + v8i16x_info, i32mem, LdFrag, InVecNode>, + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; + + defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info, + v8i16x_info, i64mem, 
LdFrag, OpNode>, + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; + } + let Predicates = [HasAVX512] in { + defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info, + v8i16x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512, VEX_WIG; + } +} + +multiclass WriteShuffle256_DQ<bits<8> opc, string OpcodeStr, + SDNode OpNode, SDNode InVecNode, string ExtTy, + X86FoldableSchedWrite sched, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> { + + let Predicates = [HasVLX, HasAVX512] in { + defm Z128: WriteShuffle256_common<opc, OpcodeStr, sched, v2i64x_info, + v4i32x_info, i64mem, LdFrag, InVecNode>, + EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128; + + defm Z256: WriteShuffle256_common<opc, OpcodeStr, sched, v4i64x_info, + v4i32x_info, i128mem, LdFrag, OpNode>, + EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256; + } + let Predicates = [HasAVX512] in { + defm Z : WriteShuffle256_common<opc, OpcodeStr, sched, v8i64_info, + v8i32x_info, i256mem, LdFrag, OpNode>, + EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512; + } +} + +defm VPMOVZXBW : WriteShuffle256_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXBD : WriteShuffle256_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXBQ : WriteShuffle256_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXWD : WriteShuffle256_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXWQ : WriteShuffle256_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z", WriteShuffle256>; +defm VPMOVZXDQ : WriteShuffle256_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z", WriteShuffle256>; + +defm VPMOVSXBW: WriteShuffle256_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXBD: WriteShuffle256_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXBQ: WriteShuffle256_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXWD: WriteShuffle256_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXWQ: WriteShuffle256_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s", WriteShuffle256>; +defm VPMOVSXDQ: WriteShuffle256_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s", WriteShuffle256>; + + +multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp, + SDNode InVecOp> { + // 128-bit patterns + let Predicates = [HasVLX, HasBWI] in { + def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; + def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; + def : Pat<(v8i16 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; + def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; + def : Pat<(v8i16 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>; + } + let Predicates = [HasVLX] in { + def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; + def : Pat<(v4i32 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; + def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; + def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>; + + def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector 
(extloadi32i16 addr:$src)))))), + (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; + def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; + def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; + def : Pat<(v2i64 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>; + + def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; + def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; + def : Pat<(v4i32 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; + def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; + def : Pat<(v4i32 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>; + + def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; + def : Pat<(v2i64 (InVecOp (v8i16 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; + def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; + def : Pat<(v2i64 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>; + + def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; + def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; + def : Pat<(v2i64 (InVecOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; + def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; + def : Pat<(v2i64 (InVecOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>; + } + // 256-bit patterns + let Predicates = [HasVLX, HasBWI] in { + def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZ256rm) addr:$src)>; + } + let Predicates = [HasVLX] in { + def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 
addr:$src)))), + (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDZ256rm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZ256rm) addr:$src)>; + } + // 512-bit patterns + let Predicates = [HasBWI] in { + def : Pat<(v32i16 (ExtOp (bc_v32i8 (loadv4i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWZrm) addr:$src)>; + } + let Predicates = [HasAVX512] in { + def : Pat<(v16i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDZrm) addr:$src)>; + + def : Pat<(v8i64 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BQZrm) addr:$src)>; + def : Pat<(v8i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQZrm) addr:$src)>; + + def : Pat<(v16i32 (ExtOp (bc_v16i16 (loadv4i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDZrm) addr:$src)>; + + def : Pat<(v8i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQZrm) addr:$src)>; + + def : Pat<(v8i64 (ExtOp (bc_v8i32 (loadv4i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQZrm) addr:$src)>; + } +} + +defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec>; +defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec>; + +//===----------------------------------------------------------------------===// +// GATHER - SCATTER Operations + +// FIXME: Improve scheduling of gather/scatter instructions. 
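+// A gather both merges loaded elements into $dst under the write mask and
+// writes the mask back (bits are cleared as elements complete), which is why
+// the class below returns two results and ties/earlyclobbers the destination.
+// An illustrative assembly form (register names chosen arbitrarily):
+//   vgatherdps zmm0 {k1}, [rax + zmm1*4]
+// zmm1 supplies the per-element indices, k1 selects which lanes are loaded,
+// and k1 reads as zero once the instruction completes.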
+multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86MemOperand memop, PatFrag GatherNode, + RegisterClass MaskRC = _.KRCWM> { + let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb", + ExeDomain = _.ExeDomain in + def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, MaskRC:$mask_wb), + (ins _.RC:$src1, MaskRC:$mask, memop:$src2), + !strconcat(OpcodeStr#_.Suffix, + "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), + [(set _.RC:$dst, MaskRC:$mask_wb, + (GatherNode (_.VT _.RC:$src1), MaskRC:$mask, + vectoraddr:$src2))]>, EVEX, EVEX_K, + EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>; +} + +multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, + vy512xmem, mgatherv8i32>, EVEX_V512, VEX_W; + defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512, + vz512mem, mgatherv8i64>, EVEX_V512, VEX_W; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256, + vx256xmem, mgatherv4i32>, EVEX_V256, VEX_W; + defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256, + vy256xmem, mgatherv4i64>, EVEX_V256, VEX_W; + defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, + vx128xmem, mgatherv4i32>, EVEX_V128, VEX_W; + defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, + vx128xmem, mgatherv2i64>, EVEX_V128, VEX_W; +} +} + +multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem, + mgatherv16i32>, EVEX_V512; + defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz256mem, + mgatherv8i64>, EVEX_V512; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256, + vy256xmem, mgatherv8i32>, EVEX_V256; + defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128, + vy128xmem, mgatherv4i64>, EVEX_V256; + defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, + vx128xmem, mgatherv4i32>, EVEX_V128; + defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mgatherv2i64, VK2WM>, + EVEX_V128; +} +} + + +defm VGATHER : avx512_gather_q_pd<0x92, 0x93, avx512vl_f64_info, "vgather", "PD">, + avx512_gather_d_ps<0x92, 0x93, avx512vl_f32_info, "vgather", "PS">; + +defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q">, + avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">; + +multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86MemOperand memop, PatFrag ScatterNode, + RegisterClass MaskRC = _.KRCWM> { + +let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in + + def mr : AVX5128I<opc, MRMDestMem, (outs MaskRC:$mask_wb), + (ins memop:$dst, MaskRC:$mask, _.RC:$src), + !strconcat(OpcodeStr#_.Suffix, + "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + [(set MaskRC:$mask_wb, (ScatterNode (_.VT _.RC:$src), + MaskRC:$mask, vectoraddr:$dst))]>, + EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[WriteStore]>; +} + +multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, + vy512xmem, mscatterv8i32>, EVEX_V512, VEX_W; + defm NAME##Q##SUFF##Z: 
avx512_scatter<qopc, OpcodeStr##"q", _.info512, + vz512mem, mscatterv8i64>, EVEX_V512, VEX_W; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256, + vx256xmem, mscatterv4i32>, EVEX_V256, VEX_W; + defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info256, + vy256xmem, mscatterv4i64>, EVEX_V256, VEX_W; + defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128, + vx128xmem, mscatterv4i32>, EVEX_V128, VEX_W; + defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128, + vx128xmem, mscatterv2i64>, EVEX_V128, VEX_W; +} +} + +multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc, + AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> { + defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem, + mscatterv16i32>, EVEX_V512; + defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz256mem, + mscatterv8i64>, EVEX_V512; +let Predicates = [HasVLX] in { + defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256, + vy256xmem, mscatterv8i32>, EVEX_V256; + defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info128, + vy128xmem, mscatterv4i64>, EVEX_V256; + defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128, + vx128xmem, mscatterv4i32>, EVEX_V128; + defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128, + vx64xmem, mscatterv2i64, VK2WM>, + EVEX_V128; +} +} + +defm VSCATTER : avx512_scatter_q_pd<0xA2, 0xA3, avx512vl_f64_info, "vscatter", "PD">, + avx512_scatter_d_ps<0xA2, 0xA3, avx512vl_f32_info, "vscatter", "PS">; + +defm VPSCATTER : avx512_scatter_q_pd<0xA0, 0xA1, avx512vl_i64_info, "vpscatter", "Q">, + avx512_scatter_d_ps<0xA0, 0xA1, avx512vl_i32_info, "vpscatter", "D">; + +// prefetch +multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr, + RegisterClass KRC, X86MemOperand memop> { + let Predicates = [HasPFI], mayLoad = 1, mayStore = 1 in + def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src), + !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"), []>, + EVEX, EVEX_K, Sched<[WriteLoad]>; +} + +defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", + VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps", + VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd", + VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd", + VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps", + VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps", + VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd", + VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd", + VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps", + VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps", + VK8WM, vz256mem>, 
EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd", + VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd", + VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps", + VK16WM, vz512mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps", + VK8WM, vz256mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>; + +defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd", + VK8WM, vy512xmem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>; + +defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", + VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; + +multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { +def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), + !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"), + [(set Vec.RC:$dst, (Vec.VT (sext Vec.KRC:$src)))]>, + EVEX, Sched<[WriteMove]>; // TODO - WriteVecTrunc? +} + +multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo, + string OpcodeStr, Predicate prd> { +let Predicates = [prd] in + defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr>, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256; + defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; + } +} + +defm VPMOVM2B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, "vpmovm2" , HasBWI>; +defm VPMOVM2W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, "vpmovm2", HasBWI> , VEX_W; +defm VPMOVM2D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, "vpmovm2", HasDQI>; +defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI> , VEX_W; + +multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > { + def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set _.KRC:$dst, (X86pcmpgtm _.ImmAllZerosV, (_.VT _.RC:$src)))]>, + EVEX, Sched<[WriteMove]>; +} + +// Use 512bit version to implement 128/256 bit in case NoVLX. 
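+// For example, vpmovb2m on a 256-bit source with AVX512BW but without VLX is
+// handled by widening: the YMM value is inserted into an undef ZMM, the
+// 512-bit VPMOVB2MZrr is emitted, and the narrower mask result is recovered
+// with COPY_TO_REGCLASS, which is what the lowering multiclass below encodes.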
+multiclass convert_vector_to_mask_lowering<X86VectorVTInfo ExtendInfo, + X86VectorVTInfo _, + string Name> { + + def : Pat<(_.KVT (X86pcmpgtm _.ImmAllZerosV, (_.VT _.RC:$src))), + (_.KVT (COPY_TO_REGCLASS + (!cast<Instruction>(Name#"Zrr") + (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)), + _.RC:$src, _.SubRegIdx)), + _.KRC))>; +} + +multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : convert_vector_to_mask_common <opc, VTInfo.info512, OpcodeStr>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : convert_vector_to_mask_common<opc, VTInfo.info256, OpcodeStr>, + EVEX_V256; + defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>, + EVEX_V128; + } + let Predicates = [prd, NoVLX] in { + defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256, NAME>; + defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128, NAME>; + } +} + +defm VPMOVB2M : avx512_convert_vector_to_mask<0x29, "vpmovb2m", + avx512vl_i8_info, HasBWI>; +defm VPMOVW2M : avx512_convert_vector_to_mask<0x29, "vpmovw2m", + avx512vl_i16_info, HasBWI>, VEX_W; +defm VPMOVD2M : avx512_convert_vector_to_mask<0x39, "vpmovd2m", + avx512vl_i32_info, HasDQI>; +defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m", + avx512vl_i64_info, HasDQI>, VEX_W; + +// Patterns for handling sext from a mask register to v16i8/v16i16 when DQI +// is available, but BWI is not. We can't handle this in lowering because +// a target independent DAG combine likes to combine sext and trunc. +let Predicates = [HasDQI, NoBWI] in { + def : Pat<(v16i8 (sext (v16i1 VK16:$src))), + (VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>; + def : Pat<(v16i16 (sext (v16i1 VK16:$src))), + (VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 - COMPRESS and EXPAND +// + +multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _, + string OpcodeStr, X86FoldableSchedWrite sched> { + defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst), + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (_.VT (X86compress _.RC:$src1))>, AVX5128IBase, + Sched<[sched]>; + + let mayStore = 1, hasSideEffects = 0 in + def mr : AVX5128I<opc, MRMDestMem, (outs), + (ins _.MemOp:$dst, _.RC:$src), + OpcodeStr # "\t{$src, $dst|$dst, $src}", + []>, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[sched.Folded]>; + + def mrk : AVX5128I<opc, MRMDestMem, (outs), + (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", + []>, + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[sched.Folded]>; +} + +multiclass compress_by_vec_width_lowering<X86VectorVTInfo _, string Name> { + def : Pat<(X86mCompressingStore addr:$dst, _.KRCWM:$mask, + (_.VT _.RC:$src)), + (!cast<Instruction>(Name#_.ZSuffix##mrk) + addr:$dst, _.KRCWM:$mask, _.RC:$src)>; +} + +multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, + AVX512VLVectorVTInfo VTInfo, + Predicate Pred = HasAVX512> { + let Predicates = [Pred] in + defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr, sched>, + compress_by_vec_width_lowering<VTInfo.info512, NAME>, EVEX_V512; + + let Predicates = [Pred, HasVLX] in { + defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr, sched>, + compress_by_vec_width_lowering<VTInfo.info256, NAME>, EVEX_V256; + defm Z128 : 
compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr, sched>, + compress_by_vec_width_lowering<VTInfo.info128, NAME>, EVEX_V128; + } +} + +// FIXME: Is there a better scheduler class for VPCOMPRESS? +defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", WriteVarShuffle256, + avx512vl_i32_info>, EVEX, NotMemoryFoldable; +defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", WriteVarShuffle256, + avx512vl_i64_info>, EVEX, VEX_W, NotMemoryFoldable; +defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", WriteVarShuffle256, + avx512vl_f32_info>, EVEX, NotMemoryFoldable; +defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", WriteVarShuffle256, + avx512vl_f64_info>, EVEX, VEX_W, NotMemoryFoldable; + +// expand +multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, + string OpcodeStr, X86FoldableSchedWrite sched> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (_.VT (X86expand _.RC:$src1))>, AVX5128IBase, + Sched<[sched]>; + + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", + (_.VT (X86expand (_.VT (bitconvert + (_.LdFrag addr:$src1)))))>, + AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass expand_by_vec_width_lowering<X86VectorVTInfo _, string Name> { + + def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, undef)), + (!cast<Instruction>(Name#_.ZSuffix##rmkz) + _.KRCWM:$mask, addr:$src)>; + + def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, _.ImmAllZerosV)), + (!cast<Instruction>(Name#_.ZSuffix##rmkz) + _.KRCWM:$mask, addr:$src)>; + + def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, + (_.VT _.RC:$src0))), + (!cast<Instruction>(Name#_.ZSuffix##rmk) + _.RC:$src0, _.KRCWM:$mask, addr:$src)>; +} + +multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, + AVX512VLVectorVTInfo VTInfo, + Predicate Pred = HasAVX512> { + let Predicates = [Pred] in + defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr, sched>, + expand_by_vec_width_lowering<VTInfo.info512, NAME>, EVEX_V512; + + let Predicates = [Pred, HasVLX] in { + defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr, sched>, + expand_by_vec_width_lowering<VTInfo.info256, NAME>, EVEX_V256; + defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr, sched>, + expand_by_vec_width_lowering<VTInfo.info128, NAME>, EVEX_V128; + } +} + +// FIXME: Is there a better scheduler class for VPEXPAND? 
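+// Expand is the inverse of compress: consecutive elements from the start of
+// the source are written only to the destination lanes whose mask bit is set.
+// For instance, for vpexpandd with k = 0b1010, destination lane 1 receives
+// source element 0 and lane 3 receives source element 1, while the unselected
+// lanes are merged or zeroed according to the masking mode.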
+defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", WriteVarShuffle256, + avx512vl_i32_info>, EVEX; +defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", WriteVarShuffle256, + avx512vl_i64_info>, EVEX, VEX_W; +defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", WriteVarShuffle256, + avx512vl_f32_info>, EVEX; +defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", WriteVarShuffle256, + avx512vl_f64_info>, EVEX, VEX_W; + +//handle instruction reg_vec1 = op(reg_vec,imm) +// op(mem_vec,imm) +// op(broadcast(eltVt),imm) +//all instruction created with FROUND_CURRENT +multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))>, Sched<[sched]>; + defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))>, + Sched<[sched.Folded, ReadAfterLd]>; + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr, + "${src1}"##_.BroadcastStr##", $src2", + (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2))>, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} +multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86FoldableSchedWrite sched, + X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in + defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, {sae}, $src1", + "$src1, {sae}, $src2", + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_NO_EXC))>, + EVEX_B, Sched<[sched]>; +} + +multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{ + let Predicates = [prd] in { + defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM, + _.info512>, + avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, + sched.ZMM, _.info512>, EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.XMM, + _.info128>, EVEX_V128; + defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.YMM, + _.info256>, EVEX_V256; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) +// op(reg_vec2,broadcast(eltVt),imm) +//all instruction created with FROUND_CURRENT +multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _>{ + let ExeDomain = _.ExeDomain in { + defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3))>, + Sched<[sched]>; + defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (OpNode 
(_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + (i32 imm:$src3))>, + Sched<[sched.Folded, ReadAfterLd]>; + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr##", $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (i32 imm:$src3))>, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) +multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo>{ + let ExeDomain = DestInfo.ExeDomain in { + defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT SrcInfo.RC:$src2), + (i8 imm:$src3)))>, + Sched<[sched]>; + defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT (bitconvert + (SrcInfo.LdFrag addr:$src2))), + (i8 imm:$src3)))>, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) +// op(reg_vec2,broadcast(eltVt),imm) +multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _>: + avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, sched, _, _>{ + + let ExeDomain = _.ExeDomain in + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr##", $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (i8 imm:$src3))>, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; +} + +//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_scalar,imm) +multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3))>, + Sched<[sched]>; + defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src2))), + (i32 imm:$src3))>, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} +multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86FoldableSchedWrite sched, + X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in + defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $src3", + (OpNode (_.VT 
_.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_NO_EXC))>, + EVEX_B, Sched<[sched]>; +} + +//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} +multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in + defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3, {sae}, $src2, $src1", + "$src1, $src2, {sae}, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_NO_EXC))>, + EVEX_B, Sched<[sched]>; +} + +multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{ + let Predicates = [prd] in { + defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, + avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, sched.ZMM, _.info512>, + EVEX_V512; + + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, + EVEX_V128; + defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, + EVEX_V256; + } +} + +multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo DestInfo, + AVX512VLVectorVTInfo SrcInfo, Predicate Pred = HasBWI> { + let Predicates = [Pred] in { + defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.ZMM, DestInfo.info512, + SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V; + } + let Predicates = [Pred, HasVLX] in { + defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.XMM, DestInfo.info128, + SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V; + defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.YMM, DestInfo.info256, + SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V; + } +} + +multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _, + bits<8> opc, SDNode OpNode, X86SchedWriteWidths sched, + Predicate Pred = HasAVX512> { + let Predicates = [Pred] in { + defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, sched.ZMM, _.info512>, + EVEX_V512; + } + let Predicates = [Pred, HasVLX] in { + defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, sched.XMM, _.info128>, + EVEX_V128; + defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, sched.YMM, _.info256>, + EVEX_V256; + } +} + +multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr, + X86VectorVTInfo _, bits<8> opc, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd> { + let Predicates = [prd] in { + defm Z : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, sched.XMM, _>, + avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeRnd, sched.XMM, _>; + } +} + +multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr, + bits<8> opcPs, bits<8> opcPd, SDNode OpNode, + SDNode OpNodeRnd, X86SchedWriteWidths sched, Predicate prd>{ + defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info, + opcPs, OpNode, OpNodeRnd, sched, prd>, + EVEX_CD8<32, CD8VF>; + defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info, + opcPd, OpNode, OpNodeRnd, sched, prd>, + EVEX_CD8<64, CD8VF>, VEX_W; +} + +defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, + X86VReduce, X86VReduceRnd, SchedWriteFRnd, HasDQI>, + AVX512AIi8Base, EVEX; +defm VRNDSCALE : 
avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, + X86VRndScale, X86VRndScaleRnd, SchedWriteFRnd, HasAVX512>, + AVX512AIi8Base, EVEX; +defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, + X86VGetMant, X86VGetMantRnd, SchedWriteFRnd, HasAVX512>, + AVX512AIi8Base, EVEX; + +defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, + 0x50, X86VRange, X86VRangeRnd, + SchedWriteFAdd, HasDQI>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info, + 0x50, X86VRange, X86VRangeRnd, + SchedWriteFAdd, HasDQI>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + +defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", + f64x_info, 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info, + 0x51, X86Ranges, X86RangesRnd, SchedWriteFAdd, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, + 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info, + 0x57, X86Reduces, X86ReducesRnd, SchedWriteFRnd, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, + 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info, + 0x27, X86GetMants, X86GetMantsRnd, SchedWriteFRnd, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + + +multiclass AVX512_rndscale_lowering<X86VectorVTInfo _, string Suffix> { + // Register + def : Pat<(_.VT (ffloor _.RC:$src)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri") + _.RC:$src, (i32 0x9))>; + def : Pat<(_.VT (fnearbyint _.RC:$src)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri") + _.RC:$src, (i32 0xC))>; + def : Pat<(_.VT (fceil _.RC:$src)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri") + _.RC:$src, (i32 0xA))>; + def : Pat<(_.VT (frint _.RC:$src)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri") + _.RC:$src, (i32 0x4))>; + def : Pat<(_.VT (ftrunc _.RC:$src)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rri") + _.RC:$src, (i32 0xB))>; + + // Merge-masking + def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src), _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") + _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x9))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src), _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") + _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xC))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src), _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") + _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xA))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src), _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") + _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0x4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src), _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrik") + _.RC:$dst, _.KRCWM:$mask, _.RC:$src, (i32 0xB))>; + + // 
Zero-masking + def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor _.RC:$src), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") + _.KRCWM:$mask, _.RC:$src, (i32 0x9))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint _.RC:$src), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") + _.KRCWM:$mask, _.RC:$src, (i32 0xC))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil _.RC:$src), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") + _.KRCWM:$mask, _.RC:$src, (i32 0xA))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint _.RC:$src), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") + _.KRCWM:$mask, _.RC:$src, (i32 0x4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc _.RC:$src), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rrikz") + _.KRCWM:$mask, _.RC:$src, (i32 0xB))>; + + // Load + def : Pat<(_.VT (ffloor (_.LdFrag addr:$src))), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") + addr:$src, (i32 0x9))>; + def : Pat<(_.VT (fnearbyint (_.LdFrag addr:$src))), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") + addr:$src, (i32 0xC))>; + def : Pat<(_.VT (fceil (_.LdFrag addr:$src))), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") + addr:$src, (i32 0xA))>; + def : Pat<(_.VT (frint (_.LdFrag addr:$src))), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") + addr:$src, (i32 0x4))>; + def : Pat<(_.VT (ftrunc (_.LdFrag addr:$src))), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmi") + addr:$src, (i32 0xB))>; + + // Merge-masking + load + def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)), + _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") + _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)), + _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") + _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)), + _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") + _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)), + _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") + _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)), + _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmik") + _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>; + + // Zero-masking + load + def : Pat<(_.VT (vselect _.KRCWM:$mask, (ffloor (_.LdFrag addr:$src)), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") + _.KRCWM:$mask, addr:$src, (i32 0x9))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (fnearbyint (_.LdFrag addr:$src)), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") + _.KRCWM:$mask, addr:$src, (i32 0xC))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (fceil (_.LdFrag addr:$src)), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") + _.KRCWM:$mask, addr:$src, (i32 0xA))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (frint (_.LdFrag addr:$src)), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") + _.KRCWM:$mask, addr:$src, (i32 0x4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, (ftrunc (_.LdFrag addr:$src)), + _.ImmAllZerosV)), + 
(!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmikz") + _.KRCWM:$mask, addr:$src, (i32 0xB))>; + + // Broadcast load + def : Pat<(_.VT (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src)))), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") + addr:$src, (i32 0x9))>; + def : Pat<(_.VT (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src)))), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") + addr:$src, (i32 0xC))>; + def : Pat<(_.VT (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src)))), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") + addr:$src, (i32 0xA))>; + def : Pat<(_.VT (frint (X86VBroadcast (_.ScalarLdFrag addr:$src)))), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") + addr:$src, (i32 0x4))>; + def : Pat<(_.VT (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src)))), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbi") + addr:$src, (i32 0xB))>; + + // Merge-masking + broadcast load + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))), + _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") + _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x9))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))), + _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") + _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xC))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))), + _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") + _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xA))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))), + _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") + _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0x4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))), + _.RC:$dst)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbik") + _.RC:$dst, _.KRCWM:$mask, addr:$src, (i32 0xB))>; + + // Zero-masking + broadcast load + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (ffloor (X86VBroadcast (_.ScalarLdFrag addr:$src))), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") + _.KRCWM:$mask, addr:$src, (i32 0x9))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (fnearbyint (X86VBroadcast (_.ScalarLdFrag addr:$src))), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") + _.KRCWM:$mask, addr:$src, (i32 0xC))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (fceil (X86VBroadcast (_.ScalarLdFrag addr:$src))), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") + _.KRCWM:$mask, addr:$src, (i32 0xA))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (frint (X86VBroadcast (_.ScalarLdFrag addr:$src))), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") + _.KRCWM:$mask, addr:$src, (i32 0x4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (ftrunc (X86VBroadcast (_.ScalarLdFrag addr:$src))), + _.ImmAllZerosV)), + (!cast<Instruction>("VRNDSCALE"#Suffix#_.ZSuffix#"rmbikz") + _.KRCWM:$mask, addr:$src, (i32 0xB))>; +} + +let Predicates = [HasAVX512] in { + defm : AVX512_rndscale_lowering<v16f32_info, "PS">; + defm : AVX512_rndscale_lowering<v8f64_info, "PD">; +} + +let Predicates = [HasVLX] in { + defm : AVX512_rndscale_lowering<v8f32x_info, "PS">; + defm : AVX512_rndscale_lowering<v4f64x_info, "PD">; + defm : AVX512_rndscale_lowering<v4f32x_info, "PS">; + defm 
: AVX512_rndscale_lowering<v2f64x_info, "PD">; +} + +multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, + X86VectorVTInfo _, + X86VectorVTInfo CastInfo, + string EVEX2VEXOvrd> { + let ExeDomain = _.ExeDomain in { + defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (bitconvert + (CastInfo.VT (X86Shuf128 _.RC:$src1, _.RC:$src2, + (i8 imm:$src3)))))>, + Sched<[sched]>, EVEX2VEXOverride<EVEX2VEXOvrd#"rr">; + defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT + (bitconvert + (CastInfo.VT (X86Shuf128 _.RC:$src1, + (bitconvert (_.LdFrag addr:$src2)), + (i8 imm:$src3)))))>, + Sched<[sched.Folded, ReadAfterLd]>, + EVEX2VEXOverride<EVEX2VEXOvrd#"rm">; + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr##", $src3", + (_.VT + (bitconvert + (CastInfo.VT + (X86Shuf128 _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + (i8 imm:$src3)))))>, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_shuff_packed_128<string OpcodeStr, X86FoldableSchedWrite sched, + AVX512VLVectorVTInfo _, + AVX512VLVectorVTInfo CastInfo, bits<8> opc, + string EVEX2VEXOvrd>{ + let Predicates = [HasAVX512] in + defm Z : avx512_shuff_packed_128_common<opc, OpcodeStr, sched, + _.info512, CastInfo.info512, "">, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in + defm Z256 : avx512_shuff_packed_128_common<opc, OpcodeStr, sched, + _.info256, CastInfo.info256, + EVEX2VEXOvrd>, EVEX_V256; +} + +defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", WriteFShuffle256, + avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; +defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", WriteFShuffle256, + avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256, + avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; +defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256, + avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; + +let Predicates = [HasAVX512] in { +// Provide fallback in case the load node that is used in the broadcast +// patterns above is used by additional users, which prevents the pattern +// selection. 
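+// With both shuffle sources set to the same register (the 128-bit value
+// inserted into lane 0 of an otherwise undef ZMM) and an immediate of 0,
+// vshuff64x2/vshufi32x4 pick 128-bit lane 0 for every destination lane, so
+// the patterns below replicate the subvector without going through memory.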
+def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))), + (VSHUFF64X2Zrri (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + 0)>; +def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))), + (VSHUFI64X2Zrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + 0)>; + +def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))), + (VSHUFF32X4Zrri (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + 0)>; +def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))), + (VSHUFI32X4Zrri (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + 0)>; + +def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))), + (VSHUFI32X4Zrri (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + 0)>; + +def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))), + (VSHUFI32X4Zrri (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + 0)>; +} + +multiclass avx512_valign<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _>{ + // NOTE: EVEX2VEXOverride changed back to Unset for 256-bit at the + // instantiation of this class. + let ExeDomain = _.ExeDomain in { + defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$src3)))>, + Sched<[sched]>, EVEX2VEXOverride<"VPALIGNRrri">; + defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (_.VT (X86VAlign _.RC:$src1, + (bitconvert (_.LdFrag addr:$src2)), + (i8 imm:$src3)))>, + Sched<[sched.Folded, ReadAfterLd]>, + EVEX2VEXOverride<"VPALIGNRrmi">; + + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr##", $src3", + (X86VAlign _.RC:$src1, + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), + (i8 imm:$src3))>, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_valign_common<string OpcodeStr, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in { + defm Z : avx512_valign<0x03, OpcodeStr, sched.ZMM, _.info512>, + AVX512AIi8Base, EVEX_4V, EVEX_V512; + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_valign<0x03, OpcodeStr, sched.XMM, _.info128>, + AVX512AIi8Base, EVEX_4V, EVEX_V128; + // We can't really override the 256-bit version so change it back to unset. + let EVEX2VEXOverride = ? 
in + defm Z256 : avx512_valign<0x03, OpcodeStr, sched.YMM, _.info256>, + AVX512AIi8Base, EVEX_4V, EVEX_V256; + } +} + +defm VALIGND: avx512_valign_common<"valignd", SchedWriteShuffle, + avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VALIGNQ: avx512_valign_common<"valignq", SchedWriteShuffle, + avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, + VEX_W; + +defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr", + SchedWriteShuffle, avx512vl_i8_info, + avx512vl_i8_info>, EVEX_CD8<8, CD8VF>; + +// Fragments to help convert valignq into masked valignd. Or valignq/valignd +// into vpalignr. +def ValignqImm32XForm : SDNodeXForm<imm, [{ + return getI8Imm(N->getZExtValue() * 2, SDLoc(N)); +}]>; +def ValignqImm8XForm : SDNodeXForm<imm, [{ + return getI8Imm(N->getZExtValue() * 8, SDLoc(N)); +}]>; +def ValigndImm8XForm : SDNodeXForm<imm, [{ + return getI8Imm(N->getZExtValue() * 4, SDLoc(N)); +}]>; + +multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode, + X86VectorVTInfo From, X86VectorVTInfo To, + SDNodeXForm ImmXForm> { + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, From.RC:$src2, + imm:$src3))), + To.RC:$src0)), + (!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask, + To.RC:$src1, To.RC:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, From.RC:$src2, + imm:$src3))), + To.ImmAllZerosV)), + (!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask, + To.RC:$src1, To.RC:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert (To.LdFrag addr:$src2)), + imm:$src3))), + To.RC:$src0)), + (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask, + To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert (To.LdFrag addr:$src2)), + imm:$src3))), + To.ImmAllZerosV)), + (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask, + To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; +} + +multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode, + X86VectorVTInfo From, + X86VectorVTInfo To, + SDNodeXForm ImmXForm> : + avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> { + def : Pat<(From.VT (OpNode From.RC:$src1, + (bitconvert (To.VT (X86VBroadcast + (To.ScalarLdFrag addr:$src2)))), + imm:$src3)), + (!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert + (To.VT (X86VBroadcast + (To.ScalarLdFrag addr:$src2)))), + imm:$src3))), + To.RC:$src0)), + (!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask, + To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert + (To.VT (X86VBroadcast + (To.ScalarLdFrag addr:$src2)))), + imm:$src3))), + To.ImmAllZerosV)), + (!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask, + To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; +} + +let Predicates = [HasAVX512] in { + // For 512-bit we lower to the widest element type we can. So we only need + // to handle converting valignq to valignd. 
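+  // Each 64-bit element spans two 32-bit elements, so a valignq rotate count
+  // of N becomes a valignd rotate count of 2*N (ValignqImm32XForm above),
+  // e.g. masked valignq with immediate 1 is rewritten as valignd with
+  // immediate 2.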
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ", X86VAlign, v8i64_info, + v16i32_info, ValignqImm32XForm>; +} + +let Predicates = [HasVLX] in { + // For 128-bit we lower to the widest element type we can. So we only need + // to handle converting valignq to valignd. + defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ128", X86VAlign, v2i64x_info, + v4i32x_info, ValignqImm32XForm>; + // For 256-bit we lower to the widest element type we can. So we only need + // to handle converting valignq to valignd. + defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ256", X86VAlign, v4i64x_info, + v8i32x_info, ValignqImm32XForm>; +} + +let Predicates = [HasVLX, HasBWI] in { + // We can turn 128 and 256 bit VALIGND/VALIGNQ into VPALIGNR. + defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v2i64x_info, + v16i8x_info, ValignqImm8XForm>; + defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v4i32x_info, + v16i8x_info, ValigndImm8XForm>; +} + +defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw", + SchedWritePSADBW, avx512vl_i16_info, avx512vl_i8_info>, + EVEX_CD8<8, CD8VF>, NotEVEX2VEXConvertible; + +multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1), OpcodeStr, + "$src1", "$src1", + (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase, + Sched<[sched]>; + + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1), OpcodeStr, + "$src1", "$src1", + (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>, + EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded]>; + } +} + +multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> : + avx512_unary_rm<opc, OpcodeStr, OpNode, sched, _> { + defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1), OpcodeStr, + "${src1}"##_.BroadcastStr, + "${src1}"##_.BroadcastStr, + (_.VT (OpNode (X86VBroadcast + (_.ScalarLdFrag addr:$src1))))>, + EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded]>; +} + +multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, sched.ZMM, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, sched.YMM, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, sched.XMM, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, + Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, sched.ZMM, VTInfo.info512>, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, sched.YMM, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, sched.XMM, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, + SDNode OpNode, X86SchedWriteWidths sched, + Predicate prd> { + defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, sched, + avx512vl_i64_info, prd>, VEX_W; + defm D : 
avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, sched, + avx512vl_i32_info, prd>; +} + +multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, + SDNode OpNode, X86SchedWriteWidths sched, + Predicate prd> { + defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, sched, + avx512vl_i16_info, prd>, VEX_WIG; + defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, sched, + avx512vl_i8_info, prd>, VEX_WIG; +} + +multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w, + bits<8> opc_d, bits<8> opc_q, + string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched> { + defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, sched, + HasAVX512>, + avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, sched, + HasBWI>; +} + +defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs, + SchedWriteVecALU>; + +// VPABS: Use 512bit version to implement 128/256 bit in case NoVLX. +let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v4i64 (abs VR256X:$src)), + (EXTRACT_SUBREG + (VPABSQZrr + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)), + sub_ymm)>; + def : Pat<(v2i64 (abs VR128X:$src)), + (EXTRACT_SUBREG + (VPABSQZrr + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)), + sub_xmm)>; +} + +// Use 512bit version to implement 128/256 bit. +multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd, NoVLX] in { + def : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)), + (EXTRACT_SUBREG + (!cast<Instruction>(InstrStr # "Zrr") + (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), + _.info256.RC:$src1, + _.info256.SubRegIdx)), + _.info256.SubRegIdx)>; + + def : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)), + (EXTRACT_SUBREG + (!cast<Instruction>(InstrStr # "Zrr") + (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), + _.info128.RC:$src1, + _.info128.SubRegIdx)), + _.info128.SubRegIdx)>; + } +} + +defm VPLZCNT : avx512_unary_rm_vl_dq<0x44, 0x44, "vplzcnt", ctlz, + SchedWriteVecIMul, HasCDI>; + +// FIXME: Is there a better scheduler class for VPCONFLICT? +defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, + SchedWriteVecALU, HasCDI>; + +// VPLZCNT: Use 512bit version to implement 128/256 bit in case NoVLX. +defm : avx512_unary_lowering<"VPLZCNTQ", ctlz, avx512vl_i64_info, HasCDI>; +defm : avx512_unary_lowering<"VPLZCNTD", ctlz, avx512vl_i32_info, HasCDI>; + +//===---------------------------------------------------------------------===// +// Counts number of ones - VPOPCNTD and VPOPCNTQ +//===---------------------------------------------------------------------===// + +// FIXME: Is there a better scheduler class for VPOPCNTD/VPOPCNTQ? 
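+// VPOPCNTD/VPOPCNTQ are gated on HasVPOPCNTDQ; without VLX the 128/256-bit
+// operations are widened to the 512-bit instruction by the
+// avx512_unary_lowering patterns below.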
+defm VPOPCNT : avx512_unary_rm_vl_dq<0x55, 0x55, "vpopcnt", ctpop, + SchedWriteVecALU, HasVPOPCNTDQ>; + +defm : avx512_unary_lowering<"VPOPCNTQ", ctpop, avx512vl_i64_info, HasVPOPCNTDQ>; +defm : avx512_unary_lowering<"VPOPCNTD", ctpop, avx512vl_i32_info, HasVPOPCNTDQ>; + +//===---------------------------------------------------------------------===// +// Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// + +multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched> { + defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, sched, + avx512vl_f32_info, HasAVX512>, XS; +} + +defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup, + SchedWriteFShuffle>; +defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup, + SchedWriteFShuffle>; + +//===----------------------------------------------------------------------===// +// AVX-512 - MOVDDUP +//===----------------------------------------------------------------------===// + +multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX, + Sched<[sched]>; + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, + EVEX, EVEX_CD8<_.EltSize, CD8VH>, + Sched<[sched.Folded]>; + } +} + +multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo> { + defm Z : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.ZMM, + VTInfo.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, sched.YMM, + VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_movddup_128<opc, OpcodeStr, X86VBroadcast, sched.XMM, + VTInfo.info128>, EVEX_V128; + } +} + +multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched> { + defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, sched, + avx512vl_f64_info>, XD, VEX_W; +} + +defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle>; + +let Predicates = [HasVLX] in { +def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPZ128rm addr:$src)>; +def : Pat<(v2f64 (X86VBroadcast f64:$src)), + (VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; +def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), + (VMOVDDUPZ128rm addr:$src)>; + +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), + (v2f64 VR128X:$src0)), + (VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask, + (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), + (bitconvert (v4i32 immAllZerosV))), + (VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>; + +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), + (v2f64 VR128X:$src0)), + (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), + (bitconvert (v4i32 immAllZerosV))), + (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; + +def : Pat<(vselect (v2i1 VK2WM:$mask), 
(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), + (v2f64 VR128X:$src0)), + (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))), + (bitconvert (v4i32 immAllZerosV))), + (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 - Unpack Instructions +//===----------------------------------------------------------------------===// + +defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512, + SchedWriteFShuffleSizes, 0, 1>; +defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512, + SchedWriteFShuffleSizes>; + +defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl, + SchedWriteShuffle, HasBWI>; +defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh, + SchedWriteShuffle, HasBWI>; +defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl, + SchedWriteShuffle, HasBWI>; +defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh, + SchedWriteShuffle, HasBWI>; + +defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl, + SchedWriteShuffle, HasAVX512>; +defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh, + SchedWriteShuffle, HasAVX512>; +defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl, + SchedWriteShuffle, HasAVX512>; +defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh, + SchedWriteShuffle, HasAVX512>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Extract & Insert Integer Instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + def mr : AVX512Ii8<opc, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), imm:$src2))), + addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecExtractSt]>; +} + +multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD, Sched<[WriteVecExtract]>; + + defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD; + } +} + +multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, PD, Sched<[WriteVecExtract]>; + + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in + def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + EVEX, TAPD, FoldGenData<NAME#rr>, + Sched<[WriteVecExtract]>; + + defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; + } +} + +multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _, + RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<0x16, MRMDestReg, 
(outs GRC:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GRC:$dst, + (extractelt (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD, Sched<[WriteVecExtract]>; + + def mr : AVX512Ii8<0x16, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (extractelt (_.VT _.RC:$src1), + imm:$src2),addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD, + Sched<[WriteVecExtractSt]>; + } +} + +defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>, VEX_WIG; +defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>, VEX_WIG; +defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>; +defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W; + +multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsertLd, ReadAfterLd]>; +} + +multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V, + Sched<[WriteVecInsert]>; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>; + } +} + +multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GRC:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>, + EVEX_4V, TAPD, Sched<[WriteVecInsert]>; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _, + _.ScalarLdFrag>, TAPD; + } +} + +defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info, + extloadi8>, TAPD, VEX_WIG; +defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info, + extloadi16>, PD, VEX_WIG; +defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; +defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W; + +//===----------------------------------------------------------------------===// +// VSHUFPS - VSHUFPD Operations +//===----------------------------------------------------------------------===// + +multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I, + AVX512VLVectorVTInfo VTInfo_FP>{ + defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp, + SchedWriteFShuffle>, + EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>, + AVX512AIi8Base, EVEX_4V; +} + +defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS; +defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W; + +//===----------------------------------------------------------------------===// +// AVX-512 - Byte shift Left/Right 
+//===----------------------------------------------------------------------===// + +// FIXME: The SSE/AVX names are PSLLDQri etc. - should we add the i here as well? +multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _>{ + def rr : AVX512<opc, MRMr, + (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, + Sched<[sched]>; + def rm : AVX512<opc, MRMm, + (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst,(_.VT (OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i8 imm:$src2))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, + X86SchedWriteWidths sched, Predicate prd>{ + let Predicates = [prd] in + defm Z : avx512_shift_packed<opc, OpNode, MRMr, MRMm, OpcodeStr, + sched.ZMM, v64i8_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, OpcodeStr, + sched.YMM, v32i8x_info>, EVEX_V256; + defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, OpcodeStr, + sched.XMM, v16i8x_info>, EVEX_V128; + } +} +defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", + SchedWriteShuffle, HasBWI>, + AVX512PDIi8Base, EVEX_4V, VEX_WIG; +defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", + SchedWriteShuffle, HasBWI>, + AVX512PDIi8Base, EVEX_4V, VEX_WIG; + +multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, + string OpcodeStr, X86FoldableSchedWrite sched, + X86VectorVTInfo _dst, X86VectorVTInfo _src> { + def rr : AVX512BI<opc, MRMSrcReg, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT _src.RC:$src2))))]>, + Sched<[sched]>; + def rm : AVX512BI<opc, MRMSrcMem, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT (bitconvert + (_src.LdFrag addr:$src2))))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode, + string OpcodeStr, X86SchedWriteWidths sched, + Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_psadbw_packed<opc, OpNode, OpcodeStr, sched.ZMM, + v8i64_info, v64i8_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, sched.YMM, + v4i64x_info, v32i8x_info>, EVEX_V256; + defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, sched.XMM, + v2i64x_info, v16i8x_info>, EVEX_V128; + } +} + +defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", + SchedWritePSADBW, HasBWI>, EVEX_4V, VEX_WIG; + +// Transforms to swizzle an immediate to enable better matching when +// memory operand isn't in the right place. +def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{ + // Convert a VPTERNLOG immediate by swapping operand 0 and operand 2. + uint8_t Imm = N->getZExtValue(); + // Swap bits 1/4 and 3/6. 
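+  // Each immediate bit is indexed by (op0<<2)|(op1<<1)|op2, so exchanging
+  // operands 0 and 2 swaps index pairs 0b001/0b100 and 0b011/0b110 while the
+  // palindromic indices 0,2,5,7 (mask 0xa5) stay put. E.g. 0xCA (op0?op1:op2)
+  // becomes 0xD8 (op2?op1:op0).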
+ uint8_t NewImm = Imm & 0xa5; + if (Imm & 0x02) NewImm |= 0x10; + if (Imm & 0x10) NewImm |= 0x02; + if (Imm & 0x08) NewImm |= 0x40; + if (Imm & 0x40) NewImm |= 0x08; + return getI8Imm(NewImm, SDLoc(N)); +}]>; +def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{ + // Convert a VPTERNLOG immediate by swapping operand 1 and operand 2. + uint8_t Imm = N->getZExtValue(); + // Swap bits 2/4 and 3/5. + uint8_t NewImm = Imm & 0xc3; + if (Imm & 0x04) NewImm |= 0x10; + if (Imm & 0x10) NewImm |= 0x04; + if (Imm & 0x08) NewImm |= 0x20; + if (Imm & 0x20) NewImm |= 0x08; + return getI8Imm(NewImm, SDLoc(N)); +}]>; +def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{ + // Convert a VPTERNLOG immediate by swapping operand 1 and operand 2. + uint8_t Imm = N->getZExtValue(); + // Swap bits 1/2 and 5/6. + uint8_t NewImm = Imm & 0x99; + if (Imm & 0x02) NewImm |= 0x04; + if (Imm & 0x04) NewImm |= 0x02; + if (Imm & 0x20) NewImm |= 0x40; + if (Imm & 0x40) NewImm |= 0x20; + return getI8Imm(NewImm, SDLoc(N)); +}]>; +def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{ + // Convert a VPTERNLOG immediate by moving operand 1 to the end. + uint8_t Imm = N->getZExtValue(); + // Move bits 1->2, 2->4, 3->6, 4->1, 5->3, 6->5 + uint8_t NewImm = Imm & 0x81; + if (Imm & 0x02) NewImm |= 0x04; + if (Imm & 0x04) NewImm |= 0x10; + if (Imm & 0x08) NewImm |= 0x40; + if (Imm & 0x10) NewImm |= 0x02; + if (Imm & 0x20) NewImm |= 0x08; + if (Imm & 0x40) NewImm |= 0x20; + return getI8Imm(NewImm, SDLoc(N)); +}]>; +def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{ + // Convert a VPTERNLOG immediate by moving operand 2 to the beginning. + uint8_t Imm = N->getZExtValue(); + // Move bits 1->4, 2->1, 3->5, 4->2, 5->6, 6->3 + uint8_t NewImm = Imm & 0x81; + if (Imm & 0x02) NewImm |= 0x10; + if (Imm & 0x04) NewImm |= 0x02; + if (Imm & 0x08) NewImm |= 0x20; + if (Imm & 0x10) NewImm |= 0x04; + if (Imm & 0x20) NewImm |= 0x40; + if (Imm & 0x40) NewImm |= 0x08; + return getI8Imm(NewImm, SDLoc(N)); +}]>; + +multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _, + string Name>{ + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { + defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT _.RC:$src3), + (i8 imm:$src4)), 1, 1>, + AVX512AIi8Base, EVEX_4V, Sched<[sched]>; + defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (bitconvert (_.LdFrag addr:$src3))), + (i8 imm:$src4)), 1, 0>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; + defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr##", $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (i8 imm:$src4)), 1, 0>, EVEX_B, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[sched.Folded, ReadAfterLd]>; + }// Constraints = "$src1 = $dst" + + // Additional patterns for matching passthru operand in other positions. 
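+  // The merge-masked instruction ties the passthru value to $src1, so when the
+  // passthru shows up as a different ternlog operand the sources are reordered
+  // and the immediate is remapped with the transforms above.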
+ def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + + // Additional patterns for matching loads in other positions. + def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4))), + (!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2, + addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (OpNode _.RC:$src1, + (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4))), + (!cast<Instruction>(Name#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2, + addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + + // Additional patterns for matching zero masking with loads in other + // positions. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.ImmAllZerosV)), + (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4)), + _.ImmAllZerosV)), + (!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + + // Additional patterns for matching masked loads with different + // operand orders. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src2, _.RC:$src1, + (bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (bitconvert (_.LdFrag addr:$src3)), + _.RC:$src1, _.RC:$src2, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>; + + // Additional patterns for matching broadcasts in other positions. 
+ def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4))), + (!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, + addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (OpNode _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4))), + (!cast<Instruction>(Name#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2, + addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + + // Additional patterns for matching zero masking with broadcasts in other + // positions. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.ImmAllZerosV)), + (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1, + _.KRCWM:$mask, _.RC:$src2, addr:$src3, + (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4)), + _.ImmAllZerosV)), + (!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1, + _.KRCWM:$mask, _.RC:$src2, addr:$src3, + (VPTERNLOG132_imm8 imm:$src4))>; + + // Additional patterns for matching masked broadcasts with different + // operand orders. + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src2, _.RC:$src1, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + (i8 imm:$src4)), _.RC:$src1)), + (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode _.RC:$src2, + (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src1, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>; + def : Pat<(_.VT (vselect _.KRCWM:$mask, + (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), + _.RC:$src1, _.RC:$src2, (i8 imm:$src4)), + _.RC:$src1)), + (!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, + _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>; +} + +multiclass avx512_common_ternlog<string OpcodeStr, X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _> { + let Predicates = [HasAVX512] in + defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, sched.ZMM, + _.info512, NAME>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, sched.XMM, + _.info128, NAME>, EVEX_V128; + defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, sched.YMM, + _.info256, NAME>, EVEX_V256; + } +} + +defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU, + avx512vl_i32_info>; +defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU, + avx512vl_i64_info>, VEX_W; + +// Patterns to implement vnot using vpternlog instead of creating all ones +// using pcmpeq or vpternlog and then xoring with that. 
The value 15 is chosen +// so that the result is only dependent on src0. But we use the same source +// for all operands to prevent a false dependency. +// TODO: We should maybe have a more generalized algorithm for folding to +// vpternlog. +let Predicates = [HasAVX512] in { + def : Pat<(v8i64 (xor VR512:$src, (bc_v8i64 (v16i32 immAllOnesV)))), + (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; +} + +let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (i8 15)), sub_xmm)>; + def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))), + (EXTRACT_SUBREG + (VPTERNLOGQZrri + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (i8 15)), sub_ymm)>; +} + +let Predicates = [HasVLX] in { + def : Pat<(v2i64 (xor VR128X:$src, (bc_v2i64 (v4i32 immAllOnesV)))), + (VPTERNLOGQZ128rri VR128X:$src, VR128X:$src, VR128X:$src, (i8 15))>; + def : Pat<(v4i64 (xor VR256X:$src, (bc_v4i64 (v8i32 immAllOnesV)))), + (VPTERNLOGQZ256rri VR256X:$src, VR256X:$src, VR256X:$src, (i8 15))>; +} + +//===----------------------------------------------------------------------===// +// AVX-512 - FixupImm +//===----------------------------------------------------------------------===// + +multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _, + X86VectorVTInfo TblVT>{ + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { + defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), + OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (TblVT.VT _.RC:$src3), + (i32 imm:$src4), + (i32 FROUND_CURRENT))>, Sched<[sched]>; + defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4), + OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))), + (i32 imm:$src4), + (i32 FROUND_CURRENT))>, + Sched<[sched.Folded, ReadAfterLd]>; + defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), + OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr##", $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (TblVT.VT (X86VBroadcast(TblVT.ScalarLdFrag addr:$src3))), + (i32 imm:$src4), + (i32 FROUND_CURRENT))>, + EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + } // Constraints = "$src1 = $dst" +} + +multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86FoldableSchedWrite sched, + X86VectorVTInfo _, X86VectorVTInfo TblVT>{ +let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { + defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), + OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2", + "$src2, $src3, {sae}, $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (TblVT.VT _.RC:$src3), + (i32 imm:$src4), + 
(i32 FROUND_NO_EXC))>, + EVEX_B, Sched<[sched]>; + } +} + +multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo _, + X86VectorVTInfo _src3VT> { + let Constraints = "$src1 = $dst" , Predicates = [HasAVX512], + ExeDomain = _.ExeDomain in { + defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), + OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_src3VT.VT _src3VT.RC:$src3), + (i32 imm:$src4), + (i32 FROUND_CURRENT))>, Sched<[sched]>; + defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), + OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2", + "$src2, $src3, {sae}, $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_src3VT.VT _src3VT.RC:$src3), + (i32 imm:$src4), + (i32 FROUND_NO_EXC))>, + EVEX_B, Sched<[sched.Folded, ReadAfterLd]>; + defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), + OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_src3VT.VT (scalar_to_vector + (_src3VT.ScalarLdFrag addr:$src3))), + (i32 imm:$src4), + (i32 FROUND_CURRENT))>, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched, + AVX512VLVectorVTInfo _Vec, + AVX512VLVectorVTInfo _Tbl> { + let Predicates = [HasAVX512] in + defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.ZMM, + _Vec.info512, _Tbl.info512>, + avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, sched.ZMM, + _Vec.info512, _Tbl.info512>, AVX512AIi8Base, + EVEX_4V, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.XMM, + _Vec.info128, _Tbl.info128>, AVX512AIi8Base, + EVEX_4V, EVEX_V128; + defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, sched.YMM, + _Vec.info256, _Tbl.info256>, AVX512AIi8Base, + EVEX_4V, EVEX_V256; + } +} + +defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, + SchedWriteFAdd.Scl, f32x_info, v4i32x_info>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, + SchedWriteFAdd.Scl, f64x_info, v2i64x_info>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f32_info, + avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f64_info, + avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; + +// Patterns used to select SSE scalar fp arithmetic instructions from +// either: +// +// (1) a scalar fp operation followed by a blend +// +// The effect is that the backend no longer emits unnecessary vector +// insert instructions immediately after SSE scalar fp instructions +// like addss or mulss. 
+// +// For example, given the following code: +// __m128 foo(__m128 A, __m128 B) { +// A[0] += B[0]; +// return A; +// } +// +// Previously we generated: +// addss %xmm0, %xmm1 +// movss %xmm1, %xmm0 +// +// We now generate: +// addss %xmm1, %xmm0 +// +// (2) a vector packed single/double fp operation followed by a vector insert +// +// The effect is that the backend converts the packed fp instruction +// followed by a vector insert into a single SSE scalar fp instruction. +// +// For example, given the following code: +// __m128 foo(__m128 A, __m128 B) { +// __m128 C = A + B; +// return (__m128) {c[0], a[1], a[2], a[3]}; +// } +// +// Previously we generated: +// addps %xmm0, %xmm1 +// movss %xmm1, %xmm0 +// +// We now generate: +// addss %xmm1, %xmm0 + +// TODO: Some canonicalization in lowering would simplify the number of +// patterns we have to try to match. +multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode MoveNode, + X86VectorVTInfo _, PatLeaf ZeroFP> { + let Predicates = [HasAVX512] in { + // extracted scalar math op with insert via movss + def : Pat<(MoveNode + (_.VT VR128X:$dst), + (_.VT (scalar_to_vector + (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))), + _.FRC:$src)))), + (!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst, + (_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>; + + // extracted masked scalar math op with insert via movss + def : Pat<(MoveNode (_.VT VR128X:$src1), + (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (_.EltVT + (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src2), + _.FRC:$src0))), + (!cast<Instruction>("V"#OpcPrefix#Zrr_Intk) + (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)), + VK1WM:$mask, _.VT:$src1, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>; + + // extracted masked scalar math op with insert via movss + def : Pat<(MoveNode (_.VT VR128X:$src1), + (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (_.EltVT + (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src2), (_.EltVT ZeroFP)))), + (!cast<I>("V"#OpcPrefix#Zrr_Intkz) + VK1WM:$mask, _.VT:$src1, + (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>; + } +} + +defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSS", X86Movss, v4f32x_info, fp32imm0>; +defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSS", X86Movss, v4f32x_info, fp32imm0>; +defm : AVX512_scalar_math_fp_patterns<fmul, "MULSS", X86Movss, v4f32x_info, fp32imm0>; +defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSS", X86Movss, v4f32x_info, fp32imm0>; + +defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSD", X86Movsd, v2f64x_info, fp64imm0>; +defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSD", X86Movsd, v2f64x_info, fp64imm0>; +defm : AVX512_scalar_math_fp_patterns<fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>; +defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>; + +multiclass AVX512_scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, + SDNode Move, X86VectorVTInfo _> { + let Predicates = [HasAVX512] in { + def : Pat<(_.VT (Move _.VT:$dst, + (scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))), + (!cast<Instruction>("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src)>; + } +} + +defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32x_info>; +defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64x_info>; + +multiclass AVX512_scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, + SDNode Move, X86VectorVTInfo _, + bits<8> ImmV> { + let Predicates = [HasAVX512] in { + def : Pat<(_.VT (Move _.VT:$dst, + 
(scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))), + (!cast<Instruction>("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src, + (i32 ImmV))>; + } +} + +defm : AVX512_scalar_unary_math_imm_patterns<ffloor, "RNDSCALESS", X86Movss, + v4f32x_info, 0x01>; +defm : AVX512_scalar_unary_math_imm_patterns<fceil, "RNDSCALESS", X86Movss, + v4f32x_info, 0x02>; +defm : AVX512_scalar_unary_math_imm_patterns<ffloor, "RNDSCALESD", X86Movsd, + v2f64x_info, 0x01>; +defm : AVX512_scalar_unary_math_imm_patterns<fceil, "RNDSCALESD", X86Movsd, + v2f64x_info, 0x02>; + +//===----------------------------------------------------------------------===// +// AES instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_vaes<bits<8> Op, string OpStr, string IntPrefix> { + let Predicates = [HasVLX, HasVAES] in { + defm Z128 : AESI_binop_rm_int<Op, OpStr, + !cast<Intrinsic>(IntPrefix), + loadv2i64, 0, VR128X, i128mem>, + EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V128, VEX_WIG; + defm Z256 : AESI_binop_rm_int<Op, OpStr, + !cast<Intrinsic>(IntPrefix##"_256"), + loadv4i64, 0, VR256X, i256mem>, + EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V256, VEX_WIG; + } + let Predicates = [HasAVX512, HasVAES] in + defm Z : AESI_binop_rm_int<Op, OpStr, + !cast<Intrinsic>(IntPrefix##"_512"), + loadv8i64, 0, VR512, i512mem>, + EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_WIG; +} + +defm VAESENC : avx512_vaes<0xDC, "vaesenc", "int_x86_aesni_aesenc">; +defm VAESENCLAST : avx512_vaes<0xDD, "vaesenclast", "int_x86_aesni_aesenclast">; +defm VAESDEC : avx512_vaes<0xDE, "vaesdec", "int_x86_aesni_aesdec">; +defm VAESDECLAST : avx512_vaes<0xDF, "vaesdeclast", "int_x86_aesni_aesdeclast">; + +//===----------------------------------------------------------------------===// +// PCLMUL instructions - Carry less multiplication +//===----------------------------------------------------------------------===// + +let Predicates = [HasAVX512, HasVPCLMULQDQ] in +defm VPCLMULQDQZ : vpclmulqdq<VR512, i512mem, loadv8i64, int_x86_pclmulqdq_512>, + EVEX_4V, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_WIG; + +let Predicates = [HasVLX, HasVPCLMULQDQ] in { +defm VPCLMULQDQZ128 : vpclmulqdq<VR128X, i128mem, loadv2i64, int_x86_pclmulqdq>, + EVEX_4V, EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_WIG; + +defm VPCLMULQDQZ256: vpclmulqdq<VR256X, i256mem, loadv4i64, + int_x86_pclmulqdq_256>, EVEX_4V, EVEX_V256, + EVEX_CD8<64, CD8VF>, VEX_WIG; +} + +// Aliases +defm : vpclmulqdq_aliases<"VPCLMULQDQZ", VR512, i512mem>; +defm : vpclmulqdq_aliases<"VPCLMULQDQZ128", VR128X, i128mem>; +defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>; + +//===----------------------------------------------------------------------===// +// VBMI2 +//===----------------------------------------------------------------------===// + +multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { + let Constraints = "$src1 = $dst", + ExeDomain = VTI.ExeDomain in { + defm r: AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src2, VTI.RC:$src3), OpStr, + "$src3, $src2", "$src2, $src3", + (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>, + AVX512FMA3Base, Sched<[sched]>; + defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, + "$src3, $src2", "$src2, $src3", + (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, + (VTI.VT (bitconvert (VTI.LdFrag addr:$src3)))))>, + AVX512FMA3Base, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + 
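+// As above, but also provides the broadcast-from-memory (EVEX.b) form; only
+// the dword/qword variants use this, since the word form has no
+// embedded-broadcast encoding.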
+multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo VTI> + : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched, VTI> { + let Constraints = "$src1 = $dst", + ExeDomain = VTI.ExeDomain in + defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr, + "${src3}"##VTI.BroadcastStr##", $src2", + "$src2, ${src3}"##VTI.BroadcastStr, + (OpNode VTI.RC:$src1, VTI.RC:$src2, + (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3))))>, + AVX512FMA3Base, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> { + let Predicates = [HasVBMI2] in + defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.ZMM, VTI.info512>, + EVEX_V512; + let Predicates = [HasVBMI2, HasVLX] in { + defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.YMM, VTI.info256>, + EVEX_V256; + defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, sched.XMM, VTI.info128>, + EVEX_V128; + } +} + +multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> { + let Predicates = [HasVBMI2] in + defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.ZMM, VTI.info512>, + EVEX_V512; + let Predicates = [HasVBMI2, HasVLX] in { + defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.YMM, VTI.info256>, + EVEX_V256; + defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, sched.XMM, VTI.info128>, + EVEX_V128; + } +} +multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix, + SDNode OpNode, X86SchedWriteWidths sched> { + defm W : VBMI2_shift_var_rm_common<wOp, Prefix##"w", OpNode, sched, + avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; + defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix##"d", OpNode, sched, + avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; + defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix##"q", OpNode, sched, + avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix, + SDNode OpNode, X86SchedWriteWidths sched> { + defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix##"w", sched, + avx512vl_i16_info, avx512vl_i16_info, HasVBMI2>, + VEX_W, EVEX_CD8<16, CD8VF>; + defm D : avx512_common_3Op_imm8<Prefix##"d", avx512vl_i32_info, dqOp, + OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + defm Q : avx512_common_3Op_imm8<Prefix##"q", avx512vl_i64_info, dqOp, OpNode, + sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +} + +// Concat & Shift +defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SchedWriteVecIMul>; +defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SchedWriteVecIMul>; +defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SchedWriteVecIMul>; +defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SchedWriteVecIMul>; + +// Compress +defm VPCOMPRESSB : compress_by_elt_width<0x63, "vpcompressb", WriteVarShuffle256, + avx512vl_i8_info, HasVBMI2>, EVEX, + NotMemoryFoldable; +defm VPCOMPRESSW : compress_by_elt_width <0x63, "vpcompressw", WriteVarShuffle256, + avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W, + NotMemoryFoldable; +// Expand +defm VPEXPANDB : expand_by_elt_width <0x62, "vpexpandb", WriteVarShuffle256, + avx512vl_i8_info, HasVBMI2>, EVEX; +defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", WriteVarShuffle256, + avx512vl_i16_info, HasVBMI2>, EVEX, 
VEX_W; + +//===----------------------------------------------------------------------===// +// VNNI +//===----------------------------------------------------------------------===// + +let Constraints = "$src1 = $dst" in +multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { + defm r : AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src2, VTI.RC:$src3), OpStr, + "$src3, $src2", "$src2, $src3", + (VTI.VT (OpNode VTI.RC:$src1, + VTI.RC:$src2, VTI.RC:$src3))>, + EVEX_4V, T8PD, Sched<[sched]>; + defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, + "$src3, $src2", "$src2, $src3", + (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, + (VTI.VT (bitconvert + (VTI.LdFrag addr:$src3)))))>, + EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD, + Sched<[sched.Folded, ReadAfterLd]>; + defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), + OpStr, "${src3}"##VTI.BroadcastStr##", $src2", + "$src2, ${src3}"##VTI.BroadcastStr, + (OpNode VTI.RC:$src1, VTI.RC:$src2, + (VTI.VT (X86VBroadcast + (VTI.ScalarLdFrag addr:$src3))))>, + EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B, + T8PD, Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasVNNI] in + defm Z : VNNI_rmb<Op, OpStr, OpNode, sched.ZMM, v16i32_info>, EVEX_V512; + let Predicates = [HasVNNI, HasVLX] in { + defm Z256 : VNNI_rmb<Op, OpStr, OpNode, sched.YMM, v8i32x_info>, EVEX_V256; + defm Z128 : VNNI_rmb<Op, OpStr, OpNode, sched.XMM, v4i32x_info>, EVEX_V128; + } +} + +// FIXME: Is there a better scheduler class for VPDP? +defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SchedWriteVecIMul>; +defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SchedWriteVecIMul>; +defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SchedWriteVecIMul>; +defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SchedWriteVecIMul>; + +//===----------------------------------------------------------------------===// +// Bit Algorithms +//===----------------------------------------------------------------------===// + +// FIXME: Is there a better scheduler class for VPOPCNTB/VPOPCNTW? 
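+// VPOPCNTB and VPOPCNTW share opcode 0x54 and are distinguished by VEX.W;
+// both require BITALG.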
+defm VPOPCNTB : avx512_unary_rm_vl<0x54, "vpopcntb", ctpop, SchedWriteVecALU, + avx512vl_i8_info, HasBITALG>; +defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SchedWriteVecALU, + avx512vl_i16_info, HasBITALG>, VEX_W; + +defm : avx512_unary_lowering<"VPOPCNTB", ctpop, avx512vl_i8_info, HasBITALG>; +defm : avx512_unary_lowering<"VPOPCNTW", ctpop, avx512vl_i16_info, HasBITALG>; + +multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { + defm rr : AVX512_maskable_cmp<0x8F, MRMSrcReg, VTI, (outs VTI.KRC:$dst), + (ins VTI.RC:$src1, VTI.RC:$src2), + "vpshufbitqmb", + "$src2, $src1", "$src1, $src2", + (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1), + (VTI.VT VTI.RC:$src2))>, EVEX_4V, T8PD, + Sched<[sched]>; + defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst), + (ins VTI.RC:$src1, VTI.MemOp:$src2), + "vpshufbitqmb", + "$src2, $src1", "$src1, $src2", + (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1), + (VTI.VT (bitconvert (VTI.LdFrag addr:$src2))))>, + EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass VPSHUFBITQMB_common<X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> { + let Predicates = [HasBITALG] in + defm Z : VPSHUFBITQMB_rm<sched.ZMM, VTI.info512>, EVEX_V512; + let Predicates = [HasBITALG, HasVLX] in { + defm Z256 : VPSHUFBITQMB_rm<sched.YMM, VTI.info256>, EVEX_V256; + defm Z128 : VPSHUFBITQMB_rm<sched.XMM, VTI.info128>, EVEX_V128; + } +} + +// FIXME: Is there a better scheduler class for VPSHUFBITQMB? +defm VPSHUFBITQMB : VPSHUFBITQMB_common<SchedWriteVecIMul, avx512vl_i8_info>; + +//===----------------------------------------------------------------------===// +// GFNI +//===----------------------------------------------------------------------===// + +multiclass GF2P8MULB_avx512_common<bits<8> Op, string OpStr, SDNode OpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasGFNI, HasAVX512, HasBWI] in + defm Z : avx512_binop_rm<Op, OpStr, OpNode, v64i8_info, sched.ZMM, 1>, + EVEX_V512; + let Predicates = [HasGFNI, HasVLX, HasBWI] in { + defm Z256 : avx512_binop_rm<Op, OpStr, OpNode, v32i8x_info, sched.YMM, 1>, + EVEX_V256; + defm Z128 : avx512_binop_rm<Op, OpStr, OpNode, v16i8x_info, sched.XMM, 1>, + EVEX_V128; + } +} + +defm VGF2P8MULB : GF2P8MULB_avx512_common<0xCF, "vgf2p8mulb", X86GF2P8mulb, + SchedWriteVecALU>, + EVEX_CD8<8, CD8VF>, T8PD; + +multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode, + X86FoldableSchedWrite sched, X86VectorVTInfo VTI, + X86VectorVTInfo BcstVTI> + : avx512_3Op_rm_imm8<Op, OpStr, OpNode, sched, VTI, VTI> { + let ExeDomain = VTI.ExeDomain in + defm rmbi : AVX512_maskable<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src1, VTI.ScalarMemOp:$src2, u8imm:$src3), + OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1", + "$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3", + (OpNode (VTI.VT VTI.RC:$src1), + (bitconvert (BcstVTI.VT (X86VBroadcast (loadi64 addr:$src2)))), + (i8 imm:$src3))>, EVEX_B, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass GF2P8AFFINE_avx512_common<bits<8> Op, string OpStr, SDNode OpNode, + X86SchedWriteWidths sched> { + let Predicates = [HasGFNI, HasAVX512, HasBWI] in + defm Z : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.ZMM, + v64i8_info, v8i64_info>, EVEX_V512; + let Predicates = [HasGFNI, HasVLX, HasBWI] in { + defm Z256 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.YMM, + v32i8x_info, v4i64x_info>, EVEX_V256; + defm Z128 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, sched.XMM, + 
v16i8x_info, v2i64x_info>, EVEX_V128; + } +} + +defm VGF2P8AFFINEINVQB : GF2P8AFFINE_avx512_common<0xCF, "vgf2p8affineinvqb", + X86GF2P8affineinvqb, SchedWriteVecIMul>, + EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base; +defm VGF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb", + X86GF2P8affineqb, SchedWriteVecIMul>, + EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base; + + +//===----------------------------------------------------------------------===// +// AVX5124FMAPS +//===----------------------------------------------------------------------===// + +let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedSingle, + Constraints = "$src1 = $dst" in { +defm V4FMADDPSrm : AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info, + (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), + "v4fmaddps", "$src3, $src2", "$src2, $src3", + []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>, + Sched<[SchedWriteFMA.ZMM.Folded]>; + +defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info, + (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), + "v4fnmaddps", "$src3, $src2", "$src2, $src3", + []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>, + Sched<[SchedWriteFMA.ZMM.Folded]>; + +defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info, + (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3), + "v4fmaddss", "$src3, $src2", "$src2, $src3", + []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, + Sched<[SchedWriteFMA.Scl.Folded]>; + +defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info, + (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3), + "v4fnmaddss", "$src3, $src2", "$src2, $src3", + []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, + Sched<[SchedWriteFMA.Scl.Folded]>; +} + +//===----------------------------------------------------------------------===// +// AVX5124VNNIW +//===----------------------------------------------------------------------===// + +let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedInt, + Constraints = "$src1 = $dst" in { +defm VP4DPWSSDrm : AVX512_maskable_3src_in_asm<0x52, MRMSrcMem, v16i32_info, + (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), + "vp4dpwssd", "$src3, $src2", "$src2, $src3", + []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>, + Sched<[SchedWriteFMA.ZMM.Folded]>; + +defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info, + (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), + "vp4dpwssds", "$src3, $src2", "$src2, $src3", + []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>, + Sched<[SchedWriteFMA.ZMM.Folded]>; +} + diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrArithmetic.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrArithmetic.td new file mode 100644 index 000000000..c444fa761 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrArithmetic.td @@ -0,0 +1,1338 @@ +//===-- X86InstrArithmetic.td - Integer Arithmetic Instrs --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the integer arithmetic instructions in the X86 +// architecture. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// LEA - Load Effective Address +let SchedRW = [WriteLEA] in { +let hasSideEffects = 0 in +def LEA16r : I<0x8D, MRMSrcMem, + (outs GR16:$dst), (ins anymem:$src), + "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize16; +let isReMaterializable = 1 in +def LEA32r : I<0x8D, MRMSrcMem, + (outs GR32:$dst), (ins anymem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", + [(set GR32:$dst, lea32addr:$src)]>, + OpSize32, Requires<[Not64BitMode]>; + +def LEA64_32r : I<0x8D, MRMSrcMem, + (outs GR32:$dst), (ins lea64_32mem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", + [(set GR32:$dst, lea64_32addr:$src)]>, + OpSize32, Requires<[In64BitMode]>; + +let isReMaterializable = 1 in +def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src), + "lea{q}\t{$src|$dst}, {$dst|$src}", + [(set GR64:$dst, lea64addr:$src)]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Fixed-Register Multiplication and Division Instructions. +// + +// SchedModel info for instruction that loads one value and gets the second +// (and possibly third) value from a register. +// This is used for instructions that put the memory operands before other +// uses. +class SchedLoadReg<SchedWrite SW> : Sched<[SW, + // Memory operand. + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + // Register reads (implicit or explicit). + ReadAfterLd, ReadAfterLd]>; + +// Extra precision multiplication + +// AL is really implied by AX, but the registers in Defs must match the +// SDNode results (i8, i32). +// AL,AH = AL*GR8 +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src", + // FIXME: Used for 8-bit mul, ignore result upper 8 bits. + // This probably ought to be moved to a def : Pat<> if the + // syntax can be accepted. + [(set AL, (mul AL, GR8:$src)), + (implicit EFLAGS)]>, Sched<[WriteIMul]>; +// AX,DX = AX*GR16 +let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in +def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src), + "mul{w}\t$src", + []>, OpSize16, Sched<[WriteIMul]>; +// EAX,EDX = EAX*GR32 +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in +def MUL32r : I<0xF7, MRM4r, (outs), (ins GR32:$src), + "mul{l}\t$src", + [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>, + OpSize32, Sched<[WriteIMul]>; +// RAX,RDX = RAX*GR64 +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in +def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src), + "mul{q}\t$src", + [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/]>, + Sched<[WriteIMul64]>; +// AL,AH = AL*[mem8] +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def MUL8m : I<0xF6, MRM4m, (outs), (ins i8mem :$src), + "mul{b}\t$src", + // FIXME: Used for 8-bit mul, ignore result upper 8 bits. + // This probably ought to be moved to a def : Pat<> if the + // syntax can be accepted. 
+ [(set AL, (mul AL, (loadi8 addr:$src))), + (implicit EFLAGS)]>, SchedLoadReg<WriteIMul.Folded>; +// AX,DX = AX*[mem16] +let mayLoad = 1, hasSideEffects = 0 in { +let Defs = [AX,DX,EFLAGS], Uses = [AX] in +def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src), + "mul{w}\t$src", []>, OpSize16, SchedLoadReg<WriteIMul.Folded>; +// EAX,EDX = EAX*[mem32] +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in +def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src), + "mul{l}\t$src", []>, OpSize32, SchedLoadReg<WriteIMul.Folded>; +// RAX,RDX = RAX*[mem64] +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in +def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src), + "mul{q}\t$src", []>, SchedLoadReg<WriteIMul64.Folded>, + Requires<[In64BitMode]>; +} + +let hasSideEffects = 0 in { +// AL,AH = AL*GR8 +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def IMUL8r : I<0xF6, MRM5r, (outs), (ins GR8:$src), "imul{b}\t$src", []>, + Sched<[WriteIMul]>; +// AX,DX = AX*GR16 +let Defs = [AX,DX,EFLAGS], Uses = [AX] in +def IMUL16r : I<0xF7, MRM5r, (outs), (ins GR16:$src), "imul{w}\t$src", []>, + OpSize16, Sched<[WriteIMul]>; +// EAX,EDX = EAX*GR32 +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in +def IMUL32r : I<0xF7, MRM5r, (outs), (ins GR32:$src), "imul{l}\t$src", []>, + OpSize32, Sched<[WriteIMul]>; +// RAX,RDX = RAX*GR64 +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in +def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src), "imul{q}\t$src", []>, + Sched<[WriteIMul64]>; + +let mayLoad = 1 in { +// AL,AH = AL*[mem8] +let Defs = [AL,EFLAGS,AX], Uses = [AL] in +def IMUL8m : I<0xF6, MRM5m, (outs), (ins i8mem :$src), + "imul{b}\t$src", []>, SchedLoadReg<WriteIMul.Folded>; +// AX,DX = AX*[mem16] +let Defs = [AX,DX,EFLAGS], Uses = [AX] in +def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src), + "imul{w}\t$src", []>, OpSize16, SchedLoadReg<WriteIMul.Folded>; +// EAX,EDX = EAX*[mem32] +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in +def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src), + "imul{l}\t$src", []>, OpSize32, SchedLoadReg<WriteIMul.Folded>; +// RAX,RDX = RAX*[mem64] +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in +def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src), + "imul{q}\t$src", []>, SchedLoadReg<WriteIMul64.Folded>, + Requires<[In64BitMode]>; +} +} // hasSideEffects + + +let Defs = [EFLAGS] in { +let Constraints = "$src1 = $dst" in { + +let isCommutable = 1 in { +// X = IMUL Y, Z --> X = IMUL Z, Y +// Register-Register Signed Integer Multiply +def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2), + "imul{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, GR16:$src2))]>, + Sched<[WriteIMul]>, TB, OpSize16; +def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2), + "imul{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, GR32:$src2))]>, + Sched<[WriteIMul]>, TB, OpSize32; +def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "imul{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, GR64:$src2))]>, + Sched<[WriteIMul64]>, TB; +} // isCommutable + +// Register-Memory Signed Integer Multiply +def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$src1, i16mem:$src2), + "imul{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, (loadi16 addr:$src2)))]>, + Sched<[WriteIMul.Folded, ReadAfterLd]>, TB, OpSize16; +def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src1, i32mem:$src2), + 
"imul{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, (loadi32 addr:$src2)))]>, + Sched<[WriteIMul.Folded, ReadAfterLd]>, TB, OpSize32; +def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src1, i64mem:$src2), + "imul{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, (loadi64 addr:$src2)))]>, + Sched<[WriteIMul64.Folded, ReadAfterLd]>, TB; +} // Constraints = "$src1 = $dst" + +} // Defs = [EFLAGS] + +// Surprisingly enough, these are not two address instructions! +let Defs = [EFLAGS] in { +// Register-Integer Signed Integer Multiply +def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16 + (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, imm:$src2))]>, + Sched<[WriteIMul]>, OpSize16; +def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8 + (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag GR16:$src1, i16immSExt8:$src2))]>, + Sched<[WriteIMul]>, OpSize16; +def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32 + (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, imm:$src2))]>, + Sched<[WriteIMul]>, OpSize32; +def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8 + (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag GR32:$src1, i32immSExt8:$src2))]>, + Sched<[WriteIMul]>, OpSize32; +def IMUL64rri32 : RIi32S<0x69, MRMSrcReg, // GR64 = GR64*I32 + (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt32:$src2))]>, + Sched<[WriteIMul64]>; +def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8 + (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag GR64:$src1, i64immSExt8:$src2))]>, + Sched<[WriteIMul64]>; + +// Memory-Integer Signed Integer Multiply +def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16 + (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag (loadi16 addr:$src1), imm:$src2))]>, + Sched<[WriteIMul.Folded]>, OpSize16; +def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8 + (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2), + "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR16:$dst, EFLAGS, + (X86smul_flag (loadi16 addr:$src1), + i16immSExt8:$src2))]>, + Sched<[WriteIMul.Folded]>, OpSize16; +def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32 + (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag (loadi32 addr:$src1), imm:$src2))]>, + Sched<[WriteIMul.Folded]>, OpSize32; +def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8 + (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2), + "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32:$dst, EFLAGS, + (X86smul_flag (loadi32 addr:$src1), + i32immSExt8:$src2))]>, + Sched<[WriteIMul.Folded]>, OpSize32; +def IMUL64rmi32 : RIi32S<0x69, MRMSrcMem, // GR64 = [mem64]*I32 + (outs 
GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag (loadi64 addr:$src1), + i64immSExt32:$src2))]>, + Sched<[WriteIMul64.Folded]>; +def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8 + (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2), + "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR64:$dst, EFLAGS, + (X86smul_flag (loadi64 addr:$src1), + i64immSExt8:$src2))]>, + Sched<[WriteIMul64.Folded]>; +} // Defs = [EFLAGS] + +// unsigned division/remainder +let hasSideEffects = 1 in { // so that we don't speculatively execute +let Defs = [AL,AH,EFLAGS], Uses = [AX] in +def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH + "div{b}\t$src", []>, Sched<[WriteDiv8]>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def DIV16r : I<0xF7, MRM6r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX + "div{w}\t$src", []>, Sched<[WriteDiv16]>, OpSize16; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in +def DIV32r : I<0xF7, MRM6r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX + "div{l}\t$src", []>, Sched<[WriteDiv32]>, OpSize32; +// RDX:RAX/r64 = RAX,RDX +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in +def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src), + "div{q}\t$src", []>, Sched<[WriteDiv64]>; + +let mayLoad = 1 in { +let Defs = [AL,AH,EFLAGS], Uses = [AX] in +def DIV8m : I<0xF6, MRM6m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH + "div{b}\t$src", []>, SchedLoadReg<WriteDiv8.Folded>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX + "div{w}\t$src", []>, OpSize16, SchedLoadReg<WriteDiv16.Folded>; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX +def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src), + "div{l}\t$src", []>, SchedLoadReg<WriteDiv32.Folded>, OpSize32; +// RDX:RAX/[mem64] = RAX,RDX +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in +def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src), + "div{q}\t$src", []>, SchedLoadReg<WriteDiv64.Folded>, + Requires<[In64BitMode]>; +} + +// Signed division/remainder. 
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in +def IDIV8r : I<0xF6, MRM7r, (outs), (ins GR8:$src), // AX/r8 = AL,AH + "idiv{b}\t$src", []>, Sched<[WriteIDiv8]>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def IDIV16r: I<0xF7, MRM7r, (outs), (ins GR16:$src), // DX:AX/r16 = AX,DX + "idiv{w}\t$src", []>, Sched<[WriteIDiv16]>, OpSize16; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in +def IDIV32r: I<0xF7, MRM7r, (outs), (ins GR32:$src), // EDX:EAX/r32 = EAX,EDX + "idiv{l}\t$src", []>, Sched<[WriteIDiv32]>, OpSize32; +// RDX:RAX/r64 = RAX,RDX +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in +def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src), + "idiv{q}\t$src", []>, Sched<[WriteIDiv64]>; + +let mayLoad = 1 in { +let Defs = [AL,AH,EFLAGS], Uses = [AX] in +def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src), // AX/[mem8] = AL,AH + "idiv{b}\t$src", []>, SchedLoadReg<WriteIDiv8.Folded>; +let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in +def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX + "idiv{w}\t$src", []>, OpSize16, SchedLoadReg<WriteIDiv16.Folded>; +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX +def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src), + "idiv{l}\t$src", []>, OpSize32, SchedLoadReg<WriteIDiv32.Folded>; +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX +def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src), + "idiv{q}\t$src", []>, SchedLoadReg<WriteIDiv64.Folded>, + Requires<[In64BitMode]>; +} +} // hasSideEffects = 0 + +//===----------------------------------------------------------------------===// +// Two address Instructions. +// + +// unary instructions +let CodeSize = 2 in { +let Defs = [EFLAGS] in { +let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +def NEG8r : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src1), + "neg{b}\t$dst", + [(set GR8:$dst, (ineg GR8:$src1)), + (implicit EFLAGS)]>; +def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "neg{w}\t$dst", + [(set GR16:$dst, (ineg GR16:$src1)), + (implicit EFLAGS)]>, OpSize16; +def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "neg{l}\t$dst", + [(set GR32:$dst, (ineg GR32:$src1)), + (implicit EFLAGS)]>, OpSize32; +def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "neg{q}\t$dst", + [(set GR64:$dst, (ineg GR64:$src1)), + (implicit EFLAGS)]>; +} // Constraints = "$src1 = $dst", SchedRW + +// Read-modify-write negate. +let SchedRW = [WriteALURMW] in { +def NEG8m : I<0xF6, MRM3m, (outs), (ins i8mem :$dst), + "neg{b}\t$dst", + [(store (ineg (loadi8 addr:$dst)), addr:$dst), + (implicit EFLAGS)]>; +def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst), + "neg{w}\t$dst", + [(store (ineg (loadi16 addr:$dst)), addr:$dst), + (implicit EFLAGS)]>, OpSize16; +def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst), + "neg{l}\t$dst", + [(store (ineg (loadi32 addr:$dst)), addr:$dst), + (implicit EFLAGS)]>, OpSize32; +def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst", + [(store (ineg (loadi64 addr:$dst)), addr:$dst), + (implicit EFLAGS)]>, + Requires<[In64BitMode]>; +} // SchedRW +} // Defs = [EFLAGS] + + +// Note: NOT does not set EFLAGS! 
+ +let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +def NOT8r : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src1), + "not{b}\t$dst", + [(set GR8:$dst, (not GR8:$src1))]>; +def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "not{w}\t$dst", + [(set GR16:$dst, (not GR16:$src1))]>, OpSize16; +def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "not{l}\t$dst", + [(set GR32:$dst, (not GR32:$src1))]>, OpSize32; +def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "not{q}\t$dst", + [(set GR64:$dst, (not GR64:$src1))]>; +} // Constraints = "$src1 = $dst", SchedRW + +let SchedRW = [WriteALURMW] in { +def NOT8m : I<0xF6, MRM2m, (outs), (ins i8mem :$dst), + "not{b}\t$dst", + [(store (not (loadi8 addr:$dst)), addr:$dst)]>; +def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst), + "not{w}\t$dst", + [(store (not (loadi16 addr:$dst)), addr:$dst)]>, + OpSize16; +def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst), + "not{l}\t$dst", + [(store (not (loadi32 addr:$dst)), addr:$dst)]>, + OpSize32; +def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst", + [(store (not (loadi64 addr:$dst)), addr:$dst)]>, + Requires<[In64BitMode]>; +} // SchedRW +} // CodeSize + +// TODO: inc/dec is slow for P4, but fast for Pentium-M. +let Defs = [EFLAGS] in { +let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +let CodeSize = 2 in +def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "inc{b}\t$dst", + [(set GR8:$dst, EFLAGS, (X86inc_flag GR8:$src1))]>; +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. +def INC16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "inc{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))]>, OpSize16; +def INC32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "inc{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))]>, OpSize32; +def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86inc_flag GR64:$src1))]>; +} // isConvertibleToThreeAddress = 1, CodeSize = 2 + +// Short forms only valid in 32-bit mode. Selected during MCInst lowering. 
+let CodeSize = 1, hasSideEffects = 0 in { +def INC16r_alt : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), + "inc{w}\t$dst", []>, + OpSize16, Requires<[Not64BitMode]>; +def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + "inc{l}\t$dst", []>, + OpSize32, Requires<[Not64BitMode]>; +} // CodeSize = 1, hasSideEffects = 0 +} // Constraints = "$src1 = $dst", SchedRW + +let CodeSize = 2, SchedRW = [WriteALURMW] in { +let Predicates = [UseIncDec] in { + def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst", + [(store (add (loadi8 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)]>; + def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst", + [(store (add (loadi16 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)]>, OpSize16; + def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", + [(store (add (loadi32 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)]>, OpSize32; +} // Predicates +let Predicates = [UseIncDec, In64BitMode] in { + def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", + [(store (add (loadi64 addr:$dst), 1), addr:$dst), + (implicit EFLAGS)]>; +} // Predicates +} // CodeSize = 2, SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +let CodeSize = 2 in +def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "dec{b}\t$dst", + [(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))]>; +let isConvertibleToThreeAddress = 1, CodeSize = 2 in { // Can xform into LEA. +def DEC16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "dec{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))]>, OpSize16; +def DEC32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "dec{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))]>, OpSize32; +def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src1), "dec{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86dec_flag GR64:$src1))]>; +} // isConvertibleToThreeAddress = 1, CodeSize = 2 + +// Short forms only valid in 32-bit mode. Selected during MCInst lowering. +let CodeSize = 1, hasSideEffects = 0 in { +def DEC16r_alt : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1), + "dec{w}\t$dst", []>, + OpSize16, Requires<[Not64BitMode]>; +def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), + "dec{l}\t$dst", []>, + OpSize32, Requires<[Not64BitMode]>; +} // CodeSize = 1, hasSideEffects = 0 +} // Constraints = "$src1 = $dst", SchedRW + + +let CodeSize = 2, SchedRW = [WriteALURMW] in { +let Predicates = [UseIncDec] in { + def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst", + [(store (add (loadi8 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)]>; + def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst", + [(store (add (loadi16 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)]>, OpSize16; + def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", + [(store (add (loadi32 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)]>, OpSize32; +} // Predicates +let Predicates = [UseIncDec, In64BitMode] in { + def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", + [(store (add (loadi64 addr:$dst), -1), addr:$dst), + (implicit EFLAGS)]>; +} // Predicates +} // CodeSize = 2, SchedRW +} // Defs = [EFLAGS] + +/// X86TypeInfo - This is a bunch of information that describes relevant X86 +/// information about value types. For example, it can tell you what the +/// register class and preferred load to use. 
+class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, + PatFrag loadnode, X86MemOperand memoperand, ImmType immkind, + Operand immoperand, SDPatternOperator immoperator, + Operand imm8operand, SDPatternOperator imm8operator, + bit hasOddOpcode, OperandSize opSize, + bit hasREX_WPrefix> { + /// VT - This is the value type itself. + ValueType VT = vt; + + /// InstrSuffix - This is the suffix used on instructions with this type. For + /// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q". + string InstrSuffix = instrsuffix; + + /// RegClass - This is the register class associated with this type. For + /// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64. + RegisterClass RegClass = regclass; + + /// LoadNode - This is the load node associated with this type. For + /// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64. + PatFrag LoadNode = loadnode; + + /// MemOperand - This is the memory operand associated with this type. For + /// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem. + X86MemOperand MemOperand = memoperand; + + /// ImmEncoding - This is the encoding of an immediate of this type. For + /// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32. Note that i64 -> Imm32 + /// since the immediate field of i64 instructions is a 32-bit sign extended + /// value. + ImmType ImmEncoding = immkind; + + /// ImmOperand - This is the operand kind of an immediate of this type. For + /// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm. Note that i64 -> + /// i64i32imm since the immediate field of i64 instructions is a 32-bit sign + /// extended value. + Operand ImmOperand = immoperand; + + /// ImmOperator - This is the operator that should be used to match an + /// immediate of this kind in a pattern (e.g. imm, or i64immSExt32). + SDPatternOperator ImmOperator = immoperator; + + /// Imm8Operand - This is the operand kind to use for an imm8 of this type. + /// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm. This is + /// only used for instructions that have a sign-extended imm8 field form. + Operand Imm8Operand = imm8operand; + + /// Imm8Operator - This is the operator that should be used to match an 8-bit + /// sign extended immediate of this kind in a pattern (e.g. i16immSExt8). + SDPatternOperator Imm8Operator = imm8operator; + + /// HasOddOpcode - This bit is true if the instruction should have an odd (as + /// opposed to even) opcode. Operations on i8 are usually even, operations on + /// other datatypes are odd. + bit HasOddOpcode = hasOddOpcode; + + /// OpSize - Selects whether the instruction needs a 0x66 prefix based on + /// 16-bit vs 32-bit mode. i8/i64 set this to OpSizeFixed. i16 sets this + /// to OpSize16. i32 sets this to OpSize32. + OperandSize OpSize = opSize; + + /// HasREX_WPrefix - This bit is set to true if the instruction should have + /// the 0x40 REX prefix. This is set for i64 types.
+ bit HasREX_WPrefix = hasREX_WPrefix; +} + +def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">; + + +def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem, + Imm8, i8imm, imm8_su, i8imm, invalid_node, + 0, OpSizeFixed, 0>; +def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, + Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su, + 1, OpSize16, 0>; +def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, + Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su, + 1, OpSize32, 0>; +def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, + Imm32S, i64i32imm, i64immSExt32_su, i64i8imm, i64immSExt8_su, + 1, OpSizeFixed, 1>; + +/// ITy - This instruction base class takes the type info for the instruction. +/// Using this, it: +/// 1. Concatenates together the instruction mnemonic with the appropriate +/// suffix letter, a tab, and the arguments. +/// 2. Infers whether the instruction should have a 0x66 prefix byte. +/// 3. Infers whether the instruction should have a 0x40 REX_W prefix. +/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations) +/// or 1 (for i16,i32,i64 operations). +class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, + string mnemonic, string args, list<dag> pattern> + : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4}, + opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode }, + f, outs, ins, + !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern> { + + // Infer instruction prefixes from type info. + let OpSize = typeinfo.OpSize; + let hasREX_WPrefix = typeinfo.HasREX_WPrefix; +} + +// BinOpRR - Instructions like "add reg, reg, reg". +class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + dag outlist, X86FoldableSchedWrite sched, list<dag> pattern> + : ITy<opcode, MRMDestReg, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern>, + Sched<[sched]>; + +// BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has +// just a EFLAGS as a result. +class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode> + : BinOpRR<opcode, mnemonic, typeinfo, (outs), WriteALU, + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>; + +// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has +// both a regclass and EFLAGS as a result. +class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteALU, + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>; + +// BinOpRR_RFF - Instructions like "adc reg, reg, reg", where the pattern has +// both a regclass and EFLAGS as a result, and has EFLAGS as input. +class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC, + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2, + EFLAGS))]>; + +// BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding). 
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + X86FoldableSchedWrite sched = WriteALU> + : ITy<opcode, MRMSrcReg, typeinfo, + (outs typeinfo.RegClass:$dst), + (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), + mnemonic, "{$src2, $dst|$dst, $src2}", []>, + Sched<[sched]> { + // The disassembler should know about this, but not the asmparser. + let isCodeGenOnly = 1; + let ForceDisassemble = 1; + let hasSideEffects = 0; +} + +// BinOpRR_RFF_Rev - Instructions like "adc reg, reg, reg" (reversed encoding). +class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> + : BinOpRR_Rev<opcode, mnemonic, typeinfo, WriteADC>; + +// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding). +class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> + : ITy<opcode, MRMSrcReg, typeinfo, (outs), + (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", []>, + Sched<[WriteALU]> { + // The disassembler should know about this, but not the asmparser. + let isCodeGenOnly = 1; + let ForceDisassemble = 1; + let hasSideEffects = 0; +} + +// BinOpRM - Instructions like "add reg, reg, [mem]". +class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + dag outlist, X86FoldableSchedWrite sched, list<dag> pattern> + : ITy<opcode, MRMSrcMem, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern>, + Sched<[sched.Folded, ReadAfterLd]>; + +// BinOpRM_F - Instructions like "cmp reg, [mem]". +class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs), WriteALU, + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; + +// BinOpRM_RF - Instructions like "add reg, reg, [mem]". +class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteALU, + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; + +// BinOpRM_RFF - Instructions like "adc reg, reg, [mem]". +class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC, + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2), + EFLAGS))]>; + +// BinOpRI - Instructions like "add reg, reg, imm". +class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, dag outlist, X86FoldableSchedWrite sched, list<dag> pattern> + : ITy<opcode, f, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern>, + Sched<[sched]> { + let ImmT = typeinfo.ImmEncoding; +} + +// BinOpRI_F - Instructions like "cmp reg, imm". +class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs), WriteALU, + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; + +// BinOpRI_RF - Instructions like "add reg, reg, imm".
+class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteALU, + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; +// BinOpRI_RFF - Instructions like "adc reg, reg, imm". +class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteADC, + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2, + EFLAGS))]>; + +// BinOpRI8 - Instructions like "add reg, reg, imm8". +class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, dag outlist, X86FoldableSchedWrite sched, list<dag> pattern> + : ITy<opcode, f, typeinfo, outlist, + (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2), + mnemonic, "{$src2, $src1|$src1, $src2}", pattern>, + Sched<[sched]> { + let ImmT = Imm8; // Always 8-bit immediate. +} + +// BinOpRI8_F - Instructions like "cmp reg, imm8". +class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs), WriteALU, + [(set EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; + +// BinOpRI8_RF - Instructions like "add reg, reg, imm8". +class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteALU, + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>; + +// BinOpRI8_RFF - Instructions like "adc reg, reg, imm8". +class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteADC, + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2, + EFLAGS))]>; + +// BinOpMR - Instructions like "add [mem], reg". +class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + list<dag> pattern> + : ITy<opcode, MRMDestMem, typeinfo, + (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src), + mnemonic, "{$src, $dst|$dst, $src}", pattern>; + +// BinOpMR_RMW - Instructions like "add [mem], reg". +class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst), + (implicit EFLAGS)]>, Sched<[WriteALURMW]>; + +// BinOpMR_RMW_FF - Instructions like "adc [mem], reg". +class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS), + addr:$dst), + (implicit EFLAGS)]>, Sched<[WriteADCRMW]>; + +// BinOpMR_F - Instructions like "cmp [mem], reg". +class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode> + : BinOpMR<opcode, mnemonic, typeinfo, + [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst), + typeinfo.RegClass:$src))]>, + Sched<[WriteALULd, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, ReadDefault, ReadAfterLd]>; + +// BinOpMI - Instructions like "add [mem], imm". 
+class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Format f, list<dag> pattern> + : ITy<opcode, f, typeinfo, + (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src), + mnemonic, "{$src, $dst|$dst, $src}", pattern> { + let ImmT = typeinfo.ImmEncoding; +} + +// BinOpMI_RMW - Instructions like "add [mem], imm". +class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, + [(store (opnode (typeinfo.VT (load addr:$dst)), + typeinfo.ImmOperator:$src), addr:$dst), + (implicit EFLAGS)]>, Sched<[WriteALURMW]>; +// BinOpMI_RMW_FF - Instructions like "adc [mem], imm". +class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDNode opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, + [(store (opnode (typeinfo.VT (load addr:$dst)), + typeinfo.ImmOperator:$src, EFLAGS), addr:$dst), + (implicit EFLAGS)]>, Sched<[WriteADCRMW]>; + +// BinOpMI_F - Instructions like "cmp [mem], imm". +class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI<opcode, mnemonic, typeinfo, f, + [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst), + typeinfo.ImmOperator:$src))]>, + Sched<[WriteALULd]>; + +// BinOpMI8 - Instructions like "add [mem], imm8". +class BinOpMI8<string mnemonic, X86TypeInfo typeinfo, + Format f, list<dag> pattern> + : ITy<0x82, f, typeinfo, + (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src), + mnemonic, "{$src, $dst|$dst, $src}", pattern> { + let ImmT = Imm8; // Always 8-bit immediate. +} + +// BinOpMI8_RMW - Instructions like "add [mem], imm8". +class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(store (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src), addr:$dst), + (implicit EFLAGS)]>, Sched<[WriteALURMW]>; + +// BinOpMI8_RMW_FF - Instructions like "adc [mem], imm8". +class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(store (opnode (load addr:$dst), + typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst), + (implicit EFLAGS)]>, Sched<[WriteADCRMW]>; + +// BinOpMI8_F - Instructions like "cmp [mem], imm8". +class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, + SDPatternOperator opnode, Format f> + : BinOpMI8<mnemonic, typeinfo, f, + [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst), + typeinfo.Imm8Operator:$src))]>, + Sched<[WriteALULd]>; + +// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS. +class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands, X86FoldableSchedWrite sched = WriteALU> + : ITy<opcode, RawFrm, typeinfo, + (outs), (ins typeinfo.ImmOperand:$src), + mnemonic, operands, []>, Sched<[sched]> { + let ImmT = typeinfo.ImmEncoding; + let Uses = [areg]; + let Defs = [areg, EFLAGS]; + let hasSideEffects = 0; +} + +// BinOpAI_RFF - Instructions like "adc %eax, %eax, imm", that implicitly define +// and use EFLAGS. +class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, WriteADC> { + let Uses = [areg, EFLAGS]; +} + +// BinOpAI_F - Instructions like "cmp %eax, %eax, imm", that imp-def EFLAGS. 
+class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> { + let Defs = [EFLAGS]; +} + +/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is +/// defined with "(set GPR:$dst, EFLAGS, (...". +/// +/// It would be nice to get rid of the second and third argument here, but +/// tblgen can't handle dependent type references aggressively enough: PR8330 +multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnodeflag, SDNode opnode, + bit CommutableRR, bit ConvertibleToThreeAddress> { + let Defs = [EFLAGS] in { + let Constraints = "$src1 = $dst" in { + let isCommutable = CommutableRR in { + def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; + def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; + def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; + } // isConvertibleToThreeAddress + } // isCommutable + + def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>; + def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>; + def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>; + def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>; + + def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; + def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>; + def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>; + def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; + + def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. + def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>; + def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>; + + def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>; + def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>; + } + } // Constraints = "$src1 = $dst" + + let mayLoad = 1, mayStore = 1 in { + def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; + } + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. 
+ def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in + def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in + def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. + let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + let Constraints = "$src1 = $dst" in + def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1, mayStore = 1 in + def NAME#8mi8 : BinOpMI8_RMW<mnemonic, Xi8, null_frag, MemMRM>; + } + } // Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; +} + +/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is +/// defined with "(set GPR:$dst, EFLAGS, (node LHS, RHS, EFLAGS))" like ADC and +/// SBB. +/// +/// It would be nice to get rid of the second and third argument here, but +/// tblgen can't handle dependent type references aggressively enough: PR8330 +multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnode, bit CommutableRR, + bit ConvertibleToThreeAddress> { + let Uses = [EFLAGS], Defs = [EFLAGS] in { + let Constraints = "$src1 = $dst" in { + let isCommutable = CommutableRR in { + def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; + } // isConvertibleToThreeAddress + } // isCommutable + + def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>; + def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>; + def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>; + def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>; + + def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>; + def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>; + def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>; + + def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. 
+ def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>; + + def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>; + } + } // Constraints = "$src1 = $dst" + + def NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. + def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in + def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in + def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. + let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + let Constraints = "$src1 = $dst" in + def NAME#8ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1, mayStore = 1 in + def NAME#8mi8 : BinOpMI8_RMW_FF<mnemonic, Xi8, null_frag, MemMRM>; + } + } // Uses = [EFLAGS], Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; +} + +/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is +/// defined with "(set EFLAGS, (...". It would be really nice to find a way +/// to factor this with the other ArithBinOp_*. 
+/// +multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, + string mnemonic, Format RegMRM, Format MemMRM, + SDNode opnode, + bit CommutableRR, bit ConvertibleToThreeAddress> { + let Defs = [EFLAGS] in { + let isCommutable = CommutableRR in { + def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + } + } // isCommutable + + def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>, FoldGenData<NAME#8rr>; + def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, FoldGenData<NAME#16rr>; + def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, FoldGenData<NAME#32rr>; + def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>, FoldGenData<NAME#64rr>; + + def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; + def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>; + def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; + + def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. + def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>; + + def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>; + } + + def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. + def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in + def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>; + + def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in + def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. 
+ let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, + hasSideEffects = 0 in { + def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, null_frag, RegMRM>; + let mayLoad = 1 in + def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, null_frag, MemMRM>; + } + } // Defs = [EFLAGS] + + def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; +} + + +defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m, + X86and_flag, and, 1, 0>; +defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m, + X86or_flag, or, 1, 0>; +defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m, + X86xor_flag, xor, 1, 0>; +defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m, + X86add_flag, add, 1, 1>; +let isCompare = 1 in { +defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m, + X86sub_flag, sub, 0, 0>; +} + +// Arithmetic. +defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag, + 1, 0>; +defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag, + 0, 0>; + +let isCompare = 1 in { +defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; +} + + +//===----------------------------------------------------------------------===// +// Semantically, test instructions are similar to AND, except they don't +// generate a result. From an encoding perspective, they are very different: +// they don't have all the usual imm8 and REV forms, and are encoded into a +// different space. +def X86testpat : PatFrag<(ops node:$lhs, node:$rhs), + (X86cmp (and_su node:$lhs, node:$rhs), 0)>; + +let isCompare = 1 in { + let Defs = [EFLAGS] in { + let isCommutable = 1 in { + def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , X86testpat>; + def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat>; + def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat>; + def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat>; + } // isCommutable + + def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , X86testpat>; + def TEST16mr : BinOpMR_F<0x84, "test", Xi16, X86testpat>; + def TEST32mr : BinOpMR_F<0x84, "test", Xi32, X86testpat>; + def TEST64mr : BinOpMR_F<0x84, "test", Xi64, X86testpat>; + + def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; + def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; + def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; + let Predicates = [In64BitMode] in + def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; + + def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; + def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>; + def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>; + let Predicates = [In64BitMode] in + def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>; + } // Defs = [EFLAGS] + + def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, + "{$src, %al|al, $src}">; + def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX, + "{$src, %ax|ax, $src}">; + def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX, + "{$src, %eax|eax, $src}">; + def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX, + "{$src, %rax|rax, $src}">; +} // isCompare + +//===----------------------------------------------------------------------===// +// ANDN Instruction +// +multiclass 
bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop, + PatFrag ld_frag> { + def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>, + Sched<[WriteALU]>; + def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, EFLAGS, + (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>, + Sched<[WriteALULd, ReadAfterLd]>; +} + +// Complexity is reduced to give and with immediate a chance to match first. +let Predicates = [HasBMI], Defs = [EFLAGS], AddedComplexity = -6 in { + defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8PS, VEX_4V; + defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8PS, VEX_4V, VEX_W; +} + +let Predicates = [HasBMI], AddedComplexity = -6 in { + def : Pat<(and (not GR32:$src1), GR32:$src2), + (ANDN32rr GR32:$src1, GR32:$src2)>; + def : Pat<(and (not GR64:$src1), GR64:$src2), + (ANDN64rr GR64:$src1, GR64:$src2)>; + def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)), + (ANDN32rm GR32:$src1, addr:$src2)>; + def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)), + (ANDN64rm GR64:$src1, addr:$src2)>; +} + +//===----------------------------------------------------------------------===// +// MULX Instruction +// +multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop, + X86FoldableSchedWrite sched> { +let hasSideEffects = 0 in { + let isCommutable = 1 in + def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), + !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), + []>, T8XD, VEX_4V, Sched<[sched, WriteIMulH]>; + + let mayLoad = 1 in + def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), + !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), + []>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>; +} +} + +let Predicates = [HasBMI2] in { + let Uses = [EDX] in + defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem, WriteIMul>; + let Uses = [RDX] in + defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem, WriteIMul64>, VEX_W; +} + +//===----------------------------------------------------------------------===// +// ADCX and ADOX Instructions +// +let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], + Constraints = "$src1 = $dst", AddedComplexity = 10 in { + let SchedRW = [WriteADC] in { + def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "adcx{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, + (X86adc_flag GR32:$src1, GR32:$src2, EFLAGS))]>, T8PD; + def ADCX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "adcx{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, EFLAGS, + (X86adc_flag GR64:$src1, GR64:$src2, EFLAGS))]>, T8PD; + + // We don't have patterns for ADOX yet. 
+ let hasSideEffects = 0 in { + def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "adox{l}\t{$src2, $dst|$dst, $src2}", []>, T8XS; + + def ADOX64rr : RI<0xF6, MRMSrcReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "adox{q}\t{$src2, $dst|$dst, $src2}", []>, T8XS; + } // hasSideEffects = 0 + } // SchedRW + + let mayLoad = 1, SchedRW = [WriteADCLd, ReadAfterLd] in { + def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src1, i32mem:$src2), + "adcx{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, EFLAGS, + (X86adc_flag GR32:$src1, (loadi32 addr:$src2), EFLAGS))]>, + T8PD; + + def ADCX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src1, i64mem:$src2), + "adcx{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, EFLAGS, + (X86adc_flag GR64:$src1, (loadi64 addr:$src2), EFLAGS))]>, + T8PD; + + // We don't have patterns for ADOX yet. + let hasSideEffects = 0 in { + def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$src1, i32mem:$src2), + "adox{l}\t{$src2, $dst|$dst, $src2}", []>, T8XS; + + def ADOX64rm : RI<0xF6, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$src1, i64mem:$src2), + "adox{q}\t{$src2, $dst|$dst, $src2}", []>, T8XS; + } // hasSideEffects = 0 + } // mayLoad = 1, SchedRW = [WriteADCLd] +} diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrCMovSetCC.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrCMovSetCC.td new file mode 100644 index 000000000..eda4ba5ae --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrCMovSetCC.td @@ -0,0 +1,116 @@ +//===-- X86InstrCMovSetCC.td - Conditional Move and SetCC --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 conditional move and set on condition +// instructions. +// +//===----------------------------------------------------------------------===// + + +// CMOV instructions. 
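+//
+// For illustration, each "defm ... : CMOV<...>" below produces the 16/32/64-bit
+// register-register and folded-load forms; e.g. the CMOVE family yields
+// CMOVE16rr/CMOVE32rr/CMOVE64rr and CMOVE16rm/CMOVE32rm/CMOVE64rm, printed as
+// (register choices here are arbitrary):
+//   cmovel %esi, %eax      # %eax = ZF ? %esi : %eax
+//   cmovel (%rdi), %eax    # same, with the source operand loaded from memory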
+multiclass CMOV<bits<8> opc, string Mnemonic, X86FoldableSchedWrite Sched, + PatLeaf CondNode> { + let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + isCommutable = 1, SchedRW = [Sched] in { + def NAME#16rr + : I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), + [(set GR16:$dst, + (X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))]>, + TB, OpSize16; + def NAME#32rr + : I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), + [(set GR32:$dst, + (X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))]>, + TB, OpSize32; + def NAME#64rr + :RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), + [(set GR64:$dst, + (X86cmov GR64:$src1, GR64:$src2, CondNode, EFLAGS))]>, TB; + } + + let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", + SchedRW = [Sched.Folded, ReadAfterLd] in { + def NAME#16rm + : I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), + !strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"), + [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2), + CondNode, EFLAGS))]>, TB, OpSize16; + def NAME#32rm + : I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), + !strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"), + [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2), + CondNode, EFLAGS))]>, TB, OpSize32; + def NAME#64rm + :RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2), + !strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"), + [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2), + CondNode, EFLAGS))]>, TB; + } // Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" +} // end multiclass + + +// Conditional Moves. +defm CMOVO : CMOV<0x40, "cmovo" , WriteCMOV, X86_COND_O>; +defm CMOVNO : CMOV<0x41, "cmovno", WriteCMOV, X86_COND_NO>; +defm CMOVB : CMOV<0x42, "cmovb" , WriteCMOV, X86_COND_B>; +defm CMOVAE : CMOV<0x43, "cmovae", WriteCMOV, X86_COND_AE>; +defm CMOVE : CMOV<0x44, "cmove" , WriteCMOV, X86_COND_E>; +defm CMOVNE : CMOV<0x45, "cmovne", WriteCMOV, X86_COND_NE>; +defm CMOVBE : CMOV<0x46, "cmovbe", WriteCMOV2, X86_COND_BE>; +defm CMOVA : CMOV<0x47, "cmova" , WriteCMOV2, X86_COND_A>; +defm CMOVS : CMOV<0x48, "cmovs" , WriteCMOV, X86_COND_S>; +defm CMOVNS : CMOV<0x49, "cmovns", WriteCMOV, X86_COND_NS>; +defm CMOVP : CMOV<0x4A, "cmovp" , WriteCMOV, X86_COND_P>; +defm CMOVNP : CMOV<0x4B, "cmovnp", WriteCMOV, X86_COND_NP>; +defm CMOVL : CMOV<0x4C, "cmovl" , WriteCMOV, X86_COND_L>; +defm CMOVGE : CMOV<0x4D, "cmovge", WriteCMOV, X86_COND_GE>; +defm CMOVLE : CMOV<0x4E, "cmovle", WriteCMOV, X86_COND_LE>; +defm CMOVG : CMOV<0x4F, "cmovg" , WriteCMOV, X86_COND_G>; + + +// SetCC instructions. 
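+//
+// For illustration, "defm SETE : SETCC<0x94, "sete", X86_COND_E>" below creates
+// SETEr (byte register destination) and SETEm (byte memory destination), so an
+// "x == 0" test is typically emitted as (register choices arbitrary):
+//   testl  %edi, %edi
+//   sete   %al             # %al = 1 if ZF is set, else 0
+//   movzbl %al, %eax       # widen the i8 result when an i32 is wanted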
+multiclass SETCC<bits<8> opc, string Mnemonic, PatLeaf OpNode> { + let Uses = [EFLAGS] in { + def r : I<opc, MRMXr, (outs GR8:$dst), (ins), + !strconcat(Mnemonic, "\t$dst"), + [(set GR8:$dst, (X86setcc OpNode, EFLAGS))]>, + TB, Sched<[WriteSETCC]>; + def m : I<opc, MRMXm, (outs), (ins i8mem:$dst), + !strconcat(Mnemonic, "\t$dst"), + [(store (X86setcc OpNode, EFLAGS), addr:$dst)]>, + TB, Sched<[WriteSETCCStore]>; + } // Uses = [EFLAGS] +} + +defm SETO : SETCC<0x90, "seto", X86_COND_O>; // is overflow bit set +defm SETNO : SETCC<0x91, "setno", X86_COND_NO>; // is overflow bit not set +defm SETB : SETCC<0x92, "setb", X86_COND_B>; // unsigned less than +defm SETAE : SETCC<0x93, "setae", X86_COND_AE>; // unsigned greater or equal +defm SETE : SETCC<0x94, "sete", X86_COND_E>; // equal to +defm SETNE : SETCC<0x95, "setne", X86_COND_NE>; // not equal to +defm SETBE : SETCC<0x96, "setbe", X86_COND_BE>; // unsigned less than or equal +defm SETA : SETCC<0x97, "seta", X86_COND_A>; // unsigned greater than +defm SETS : SETCC<0x98, "sets", X86_COND_S>; // is signed bit set +defm SETNS : SETCC<0x99, "setns", X86_COND_NS>; // is not signed +defm SETP : SETCC<0x9A, "setp", X86_COND_P>; // is parity bit set +defm SETNP : SETCC<0x9B, "setnp", X86_COND_NP>; // is parity bit not set +defm SETL : SETCC<0x9C, "setl", X86_COND_L>; // signed less than +defm SETGE : SETCC<0x9D, "setge", X86_COND_GE>; // signed greater or equal +defm SETLE : SETCC<0x9E, "setle", X86_COND_LE>; // signed less than or equal +defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than + +// SALC is an undocumented instruction. Information for this instruction can be found +// here http://www.rcollins.org/secrets/opcodes/SALC.html +// Set AL if carry. +let Uses = [EFLAGS], Defs = [AL], SchedRW = [WriteALU] in { + def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", []>, Requires<[Not64BitMode]>; +} diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrCompiler.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrCompiler.td new file mode 100644 index 000000000..373f85020 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrCompiler.td @@ -0,0 +1,2103 @@ +//===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the various pseudo instructions used by the compiler, +// as well as Pat patterns used during instruction selection. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Pattern Matching Support + +def GetLo32XForm : SDNodeXForm<imm, [{ + // Transformation function: get the low 32 bits. + return getI32Imm((uint32_t)N->getZExtValue(), SDLoc(N)); +}]>; + +def GetLo8XForm : SDNodeXForm<imm, [{ + // Transformation function: get the low 8 bits. + return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N)); +}]>; + + +//===----------------------------------------------------------------------===// +// Random Pseudo Instructions. + +// PIC base construction. 
This expands to code that looks like this: +// call $next_inst +// popl %destreg" +let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP, SSP], + SchedRW = [WriteJump] in + def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label), + "", []>; + +// 64-bit large code model PIC base construction. +let hasSideEffects = 0, mayLoad = 1, isNotDuplicable = 1, SchedRW = [WriteJump] in + def MOVGOT64r : PseudoI<(outs GR64:$reg), + (ins GR64:$scratch, i64i32imm_pcrel:$got), []>; + +// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into +// a stack adjustment and the codegen must know that they may modify the stack +// pointer before prolog-epilog rewriting occurs. +// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become +// sub / add which can clobber EFLAGS. +let Defs = [ESP, EFLAGS, SSP], Uses = [ESP, SSP], SchedRW = [WriteALU] in { +def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), + (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3), + "#ADJCALLSTACKDOWN", []>, Requires<[NotLP64]>; +def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKUP", + [(X86callseq_end timm:$amt1, timm:$amt2)]>, + Requires<[NotLP64]>; +} +def : Pat<(X86callseq_start timm:$amt1, timm:$amt2), + (ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>; + + +// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into +// a stack adjustment and the codegen must know that they may modify the stack +// pointer before prolog-epilog rewriting occurs. +// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become +// sub / add which can clobber EFLAGS. +let Defs = [RSP, EFLAGS, SSP], Uses = [RSP, SSP], SchedRW = [WriteALU] in { +def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), + (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3), + "#ADJCALLSTACKDOWN", []>, Requires<[IsLP64]>; +def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKUP", + [(X86callseq_end timm:$amt1, timm:$amt2)]>, + Requires<[IsLP64]>; +} +def : Pat<(X86callseq_start timm:$amt1, timm:$amt2), + (ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>; + +let SchedRW = [WriteSystem] in { + +// x86-64 va_start lowering magic. +let usesCustomInserter = 1, Defs = [EFLAGS] in { +def VASTART_SAVE_XMM_REGS : I<0, Pseudo, + (outs), + (ins GR8:$al, + i64imm:$regsavefi, i64imm:$offset, + variable_ops), + "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset", + [(X86vastart_save_xmm_regs GR8:$al, + imm:$regsavefi, + imm:$offset), + (implicit EFLAGS)]>; + +// The VAARG_64 pseudo-instruction takes the address of the va_list, +// and places the address of the next argument into a register. +let Defs = [EFLAGS] in +def VAARG_64 : I<0, Pseudo, + (outs GR64:$dst), + (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align), + "#VAARG_64 $dst, $ap, $size, $mode, $align", + [(set GR64:$dst, + (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)), + (implicit EFLAGS)]>; + + +// When using segmented stacks these are lowered into instructions which first +// check if the current stacklet has enough free memory. If it does, memory is +// allocated by bumping the stack pointer. Otherwise memory is allocated from +// the heap. 
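+// Roughly (illustrative only; the custom inserter decides the real sequence):
+// compute the prospective stack pointer, compare it against the stacklet limit
+// kept in thread-local storage, and if the request does not fit, call the
+// segmented-stack runtime helper (e.g. __morestack_allocate_stack_space) to
+// obtain heap-backed space instead of bumping the stack pointer.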
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+                      "# variable sized alloca for segmented stacks",
+                      [(set GR32:$dst,
+                         (X86SegAlloca GR32:$size))]>,
+                    Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+                      "# variable sized alloca for segmented stacks",
+                      [(set GR64:$dst,
+                         (X86SegAlloca GR64:$size))]>,
+                    Requires<[In64BitMode]>;
+}
+
+// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
+// targets. These calls are needed to probe the stack when allocating more than
+// 4k bytes in one go. Touching the stack at 4K increments is necessary to
+// ensure that the guard pages used by the OS virtual memory manager are
+// allocated in the correct sequence.
+// The main point of having a separate instruction is the extra unmodelled
+// effects (compared to ordinary calls), like the stack pointer change.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def WIN_ALLOCA_32 : I<0, Pseudo, (outs), (ins GR32:$size),
+                      "# dynamic stack allocation",
+                      [(X86WinAlloca GR32:$size)]>,
+                    Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
+                      "# dynamic stack allocation",
+                      [(X86WinAlloca GR64:$size)]>,
+                    Requires<[In64BitMode]>;
+} // SchedRW
+
+// These instructions XOR the frame pointer into a GPR. They are used in some
+// stack protection schemes. These are post-RA pseudos because we only know the
+// frame register after register allocation.
+let Constraints = "$src = $dst", isPseudo = 1, Defs = [EFLAGS] in {
+  def XOR32_FP : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src),
+                  "xorl\t$$FP, $src", []>,
+                  Requires<[NotLP64]>, Sched<[WriteALU]>;
+  def XOR64_FP : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src),
+                  "xorq\t$$FP, $src", []>,
+                  Requires<[In64BitMode]>, Sched<[WriteALU]>;
+}
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let SchedRW = [WriteSystem] in {
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
+                  "ret\t#eh_return, addr: $addr",
+                  [(X86ehret GR32:$addr)]>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
+                    "ret\t#eh_return, addr: $addr",
+                    [(X86ehret GR64:$addr)]>, Sched<[WriteJumpLd]>;
+
+}
+
+let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
+    isCodeGenOnly = 1, isReturn = 1 in {
+  def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>;
+
+  // CATCHRET needs a custom inserter for SEH.
+  let usesCustomInserter = 1 in
+    def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from),
+                     "# CATCHRET",
+                     [(catchret bb:$dst, bb:$from)]>;
+}
+
+let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
+    usesCustomInserter = 1 in
+def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>;
+
+// This instruction is responsible for re-establishing stack pointers after an
+// exception has been caught and we are rejoining normal control flow in the
+// parent function or funclet. It generally sets ESP and EBP, and optionally
+// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us
+// elsewhere.
+let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in +def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>; + +let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in { + def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), + "#EH_SJLJ_SETJMP32", + [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, + Requires<[Not64BitMode]>; + def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf), + "#EH_SJLJ_SETJMP64", + [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, + Requires<[In64BitMode]>; + let isTerminator = 1 in { + def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf), + "#EH_SJLJ_LONGJMP32", + [(X86eh_sjlj_longjmp addr:$buf)]>, + Requires<[Not64BitMode]>; + def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf), + "#EH_SJLJ_LONGJMP64", + [(X86eh_sjlj_longjmp addr:$buf)]>, + Requires<[In64BitMode]>; + } +} + +let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { + def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst), + "#EH_SjLj_Setup\t$dst", []>; +} +} // SchedRW + +//===----------------------------------------------------------------------===// +// Pseudo instructions used by unwind info. +// +let isPseudo = 1, SchedRW = [WriteSystem] in { + def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg), + "#SEH_PushReg $reg", []>; + def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveReg $reg, $dst", []>; + def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), + "#SEH_SaveXMM $reg, $dst", []>; + def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size), + "#SEH_StackAlloc $size", []>; + def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset), + "#SEH_SetFrame $reg, $offset", []>; + def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode), + "#SEH_PushFrame $mode", []>; + def SEH_EndPrologue : I<0, Pseudo, (outs), (ins), + "#SEH_EndPrologue", []>; + def SEH_Epilogue : I<0, Pseudo, (outs), (ins), + "#SEH_Epilogue", []>; +} + +//===----------------------------------------------------------------------===// +// Pseudo instructions used by segmented stacks. +// + +// This is lowered into a RET instruction by MCInstLower. We need +// this so that we don't have to have a MachineBasicBlock which ends +// with a RET and also has successors. +let isPseudo = 1, SchedRW = [WriteJumpLd] in { +def MORESTACK_RET: I<0, Pseudo, (outs), (ins), "", []>; + +// This instruction is lowered to a RET followed by a MOV. The two +// instructions are not generated on a higher level since then the +// verifier sees a MachineBasicBlock ending with a non-terminator. +def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>; +} + +//===----------------------------------------------------------------------===// +// Alias Instructions +//===----------------------------------------------------------------------===// + +// Alias instruction mapping movr0 to xor. +// FIXME: remove when we can teach regalloc that xor reg, reg is ok. +let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, + isPseudo = 1, AddedComplexity = 10 in +def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, 0)]>, Sched<[WriteZero]>; + +// Other widths can also make use of the 32-bit xor, which may have a smaller +// encoding and avoid partial register updates. 
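+//
+// For example, materializing (i64 0) through the patterns below still costs
+// only the two-byte 32-bit form, because writing a 32-bit register clears the
+// upper 32 bits (register choice arbitrary):
+//   xorl %eax, %eax        # MOV32r0; %rax is now zero as well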
+let AddedComplexity = 10 in { +def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>; +def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>; +def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>; +} + +let Predicates = [OptForSize, Not64BitMode], + AddedComplexity = 10 in { + let SchedRW = [WriteALU] in { + // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC, + // which only require 3 bytes compared to MOV32ri which requires 5. + let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in { + def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, 1)]>; + def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, -1)]>; + } + } // SchedRW + + // MOV16ri is 4 bytes, so the instructions above are smaller. + def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>; + def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>; +} + +let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5, + SchedRW = [WriteALU] in { +// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1. +def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "", + [(set GR32:$dst, i32immSExt8:$src)]>, + Requires<[OptForMinSize, NotWin64WithoutFP]>; +def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "", + [(set GR64:$dst, i64immSExt8:$src)]>, + Requires<[OptForMinSize, NotWin64WithoutFP]>; +} + +// Materialize i64 constant where top 32-bits are zero. This could theoretically +// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however +// that would make it more difficult to rematerialize. +let isReMaterializable = 1, isAsCheapAsAMove = 1, + isPseudo = 1, hasSideEffects = 0, SchedRW = [WriteMove] in +def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>; + +// This 64-bit pseudo-move can be used for both a 64-bit constant that is +// actually the zero-extension of a 32-bit constant and for labels in the +// x86-64 small code model. +def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>; + +let AddedComplexity = 1 in +def : Pat<(i64 mov64imm32:$src), + (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>; + +// Use sbb to materialize carry bit. +let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in { +// FIXME: These are pseudo ops that should be replaced with Pat<> patterns. +// However, Pat<> can't replicate the destination reg into the inputs of the +// result. 
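+//
+// These are later expanded to an sbb of a register with itself, which yields 0
+// when the carry flag is clear and all-ones when it is set, e.g. (register
+// choice arbitrary):
+//   sbbl %eax, %eax        # %eax = CF ? -1 : 0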
+def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
+                 [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
+                  [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+                  [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+                  [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+} // Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU]
+
+
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C64r)>;
+
+def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C64r)>;
+
+// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" in the hope that the and
+// will be eliminated and that the sbb can be extended up to a wider type. When
+// this happens, it is great. However, if we are left with an 8-bit sbb and an
+// and, we might as well just match it as a setb.
+def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
+          (SETBr)>;
+
+// (add OP, SETB) -> (adc OP, 0)
+def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
+          (ADC8ri GR8:$op, 0)>;
+def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
+          (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
+          (ADC64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETB) -> (sbb OP, 0)
+def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
+          (SBB64ri8 GR64:$op, 0)>;
+
+// (sub OP, SETCC_CARRY) -> (adc OP, 0)
+def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC8ri GR8:$op, 0)>;
+def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC32ri8 GR32:$op, 0)>;
+def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
+          (ADC64ri8 GR64:$op, 0)>;
+
+//===----------------------------------------------------------------------===//
+// String Pseudo Instructions
+//
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
+def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+                     [(X86rep_movs i8)]>, REP,
+                    Requires<[Not64BitMode]>;
+def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+                     [(X86rep_movs i16)]>, REP, OpSize16,
+                    Requires<[Not64BitMode]>;
+def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+                     [(X86rep_movs i32)]>, REP, OpSize32,
+                    Requires<[Not64BitMode]>;
+}
+
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
+def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+                     [(X86rep_movs i8)]>, REP,
+                    Requires<[In64BitMode]>;
+def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+                     [(X86rep_movs i16)]>, REP, OpSize16,
+                    Requires<[In64BitMode]>;
+def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+                     [(X86rep_movs i32)]>, REP, OpSize32,
+                    Requires<[In64BitMode]>;
+def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
+                      [(X86rep_movs i64)]>, REP,
+                     
Requires<[In64BitMode]>; +} + +// FIXME: Should use "(X86rep_stos AL)" as the pattern. +let Defs = [ECX,EDI], isCodeGenOnly = 1 in { + let Uses = [AL,ECX,EDI] in + def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)]>, REP, + Requires<[Not64BitMode]>; + let Uses = [AX,ECX,EDI] in + def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)]>, REP, OpSize16, + Requires<[Not64BitMode]>; + let Uses = [EAX,ECX,EDI] in + def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)]>, REP, OpSize32, + Requires<[Not64BitMode]>; +} + +let Defs = [RCX,RDI], isCodeGenOnly = 1 in { + let Uses = [AL,RCX,RDI] in + def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", + [(X86rep_stos i8)]>, REP, + Requires<[In64BitMode]>; + let Uses = [AX,RCX,RDI] in + def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", + [(X86rep_stos i16)]>, REP, OpSize16, + Requires<[In64BitMode]>; + let Uses = [RAX,RCX,RDI] in + def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", + [(X86rep_stos i32)]>, REP, OpSize32, + Requires<[In64BitMode]>; + + let Uses = [RAX,RCX,RDI] in + def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", + [(X86rep_stos i64)]>, REP, + Requires<[In64BitMode]>; +} +} // SchedRW + +//===----------------------------------------------------------------------===// +// Thread Local Storage Instructions +// +let SchedRW = [WriteSystem] in { + +// ELF TLS Support +// All calls clobber the non-callee saved registers. ESP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. +let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF], + usesCustomInserter = 1, Uses = [ESP, SSP] in { +def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_addr32", + [(X86tlsaddr tls32addr:$sym)]>, + Requires<[Not64BitMode]>; +def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLS_base_addr32", + [(X86tlsbaseaddr tls32baseaddr:$sym)]>, + Requires<[Not64BitMode]>; +} + +// All calls clobber the non-callee saved registers. RSP is marked as +// a use to prevent stack-pointer assignments that appear immediately +// before calls from potentially appearing dead. +let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, + ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS, DF], + usesCustomInserter = 1, Uses = [RSP, SSP] in { +def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_addr64", + [(X86tlsaddr tls64addr:$sym)]>, + Requires<[In64BitMode]>; +def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLS_base_addr64", + [(X86tlsbaseaddr tls64baseaddr:$sym)]>, + Requires<[In64BitMode]>; +} + +// Darwin TLS Support +// For i386, the address of the thunk is passed on the stack, on return the +// address of the variable is in %eax. %ecx is trashed during the function +// call. All other registers are preserved. 
+let Defs = [EAX, ECX, EFLAGS, DF], + Uses = [ESP, SSP], + usesCustomInserter = 1 in +def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + "# TLSCall_32", + [(X86TLSCall addr:$sym)]>, + Requires<[Not64BitMode]>; + +// For x86_64, the address of the thunk is passed in %rdi, but the +// pseudo directly use the symbol, so do not add an implicit use of +// %rdi. The lowering will do the right thing with RDI. +// On return the address of the variable is in %rax. All other +// registers are preserved. +let Defs = [RAX, EFLAGS, DF], + Uses = [RSP, SSP], + usesCustomInserter = 1 in +def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), + "# TLSCall_64", + [(X86TLSCall addr:$sym)]>, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Conditional Move Pseudo Instructions + +// CMOV* - Used to implement the SELECT DAG operation. Expanded after +// instruction selection into a branch sequence. +multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> { + def CMOV#NAME : I<0, Pseudo, + (outs RC:$dst), (ins RC:$t, RC:$f, i8imm:$cond), + "#CMOV_"#NAME#" PSEUDO!", + [(set RC:$dst, (VT (X86cmov RC:$t, RC:$f, imm:$cond, + EFLAGS)))]>; +} + +let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { + // X86 doesn't have 8-bit conditional moves. Use a customInserter to + // emit control flow. An alternative to this is to mark i8 SELECT as Promote, + // however that requires promoting the operands, and can induce additional + // i8 register pressure. + defm _GR8 : CMOVrr_PSEUDO<GR8, i8>; + + let Predicates = [NoCMov] in { + defm _GR32 : CMOVrr_PSEUDO<GR32, i32>; + defm _GR16 : CMOVrr_PSEUDO<GR16, i16>; + } // Predicates = [NoCMov] + + // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no + // SSE1/SSE2. + let Predicates = [FPStackf32] in + defm _RFP32 : CMOVrr_PSEUDO<RFP32, f32>; + + let Predicates = [FPStackf64] in + defm _RFP64 : CMOVrr_PSEUDO<RFP64, f64>; + + defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>; + + defm _FR32 : CMOVrr_PSEUDO<FR32, f32>; + defm _FR64 : CMOVrr_PSEUDO<FR64, f64>; + defm _F128 : CMOVrr_PSEUDO<VR128, f128>; + defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>; + defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>; + defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>; + defm _V8F32 : CMOVrr_PSEUDO<VR256, v8f32>; + defm _V4F64 : CMOVrr_PSEUDO<VR256, v4f64>; + defm _V4I64 : CMOVrr_PSEUDO<VR256, v4i64>; + defm _V8I64 : CMOVrr_PSEUDO<VR512, v8i64>; + defm _V8F64 : CMOVrr_PSEUDO<VR512, v8f64>; + defm _V16F32 : CMOVrr_PSEUDO<VR512, v16f32>; + defm _V8I1 : CMOVrr_PSEUDO<VK8, v8i1>; + defm _V16I1 : CMOVrr_PSEUDO<VK16, v16i1>; + defm _V32I1 : CMOVrr_PSEUDO<VK32, v32i1>; + defm _V64I1 : CMOVrr_PSEUDO<VK64, v64i1>; +} // usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] + +//===----------------------------------------------------------------------===// +// Normal-Instructions-With-Lock-Prefix Pseudo Instructions +//===----------------------------------------------------------------------===// + +// FIXME: Use normal instructions and add lock prefix dynamically. + +// Memory barriers + +// TODO: Get this to fold the constant into the instruction. 
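+// The classic full-barrier idiom on targets without MFENCE is a locked OR of
+// zero into the top of the stack; OR32mrLocked below encodes the zero in a
+// register rather than as an immediate (hence the TODO), so the emitted form
+// is roughly:
+//   lock orl %reg, (%esp)  # %reg holds 0; acts as a full memory barrier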
+let isCodeGenOnly = 1, Defs = [EFLAGS] in +def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero), + "or{l}\t{$zero, $dst|$dst, $zero}", []>, + Requires<[Not64BitMode]>, OpSize32, LOCK, + Sched<[WriteALULd, WriteRMW]>; + +let hasSideEffects = 1 in +def Int_MemBarrier : I<0, Pseudo, (outs), (ins), + "#MEMBARRIER", + [(X86MemBarrier)]>, Sched<[WriteLoad]>; + +// RegOpc corresponds to the mr version of the instruction +// ImmOpc corresponds to the mi version of the instruction +// ImmOpc8 corresponds to the mi8 version of the instruction +// ImmMod corresponds to the instruction format of the mi and mi8 versions +multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, + Format ImmMod, SDNode Op, string mnemonic> { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + +def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 }, + MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), + !strconcat(mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, GR8:$src2))]>, LOCK; + +def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, GR16:$src2))]>, + OpSize16, LOCK; + +def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, GR32:$src2))]>, + OpSize32, LOCK; + +def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4}, + RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 }, + MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, GR64:$src2))]>, LOCK; + +def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 }, + ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2), + !strconcat(mnemonic, "{b}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, (i8 imm:$src2)))]>, LOCK; + +def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, (i16 imm:$src2)))]>, + OpSize16, LOCK; + +def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, (i32 imm:$src2)))]>, + OpSize32, LOCK; + +def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4}, + ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, i64immSExt32:$src2))]>, + LOCK; + +def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2), + !strconcat(mnemonic, "{w}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, i16immSExt8:$src2))]>, + OpSize16, LOCK; + +def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + 
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2), + !strconcat(mnemonic, "{l}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, i32immSExt8:$src2))]>, + OpSize32, LOCK; + +def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4}, + ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 }, + ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2), + !strconcat(mnemonic, "{q}\t", + "{$src2, $dst|$dst, $src2}"), + [(set EFLAGS, (Op addr:$dst, i64immSExt8:$src2))]>, + LOCK; +} + +} + +defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, X86lock_add, "add">; +defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, X86lock_sub, "sub">; +defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, X86lock_or , "or">; +defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, X86lock_and, "and">; +defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, X86lock_xor, "xor">; + +multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form, + string frag, string mnemonic> { +let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { +def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst), + !strconcat(mnemonic, "{b}\t$dst"), + [(set EFLAGS, (!cast<PatFrag>(frag # "_8") addr:$dst))]>, + LOCK; +def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst), + !strconcat(mnemonic, "{w}\t$dst"), + [(set EFLAGS, (!cast<PatFrag>(frag # "_16") addr:$dst))]>, + OpSize16, LOCK; +def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst), + !strconcat(mnemonic, "{l}\t$dst"), + [(set EFLAGS, (!cast<PatFrag>(frag # "_32") addr:$dst))]>, + OpSize32, LOCK; +def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst), + !strconcat(mnemonic, "{q}\t$dst"), + [(set EFLAGS, (!cast<PatFrag>(frag # "_64") addr:$dst))]>, + LOCK; +} +} + +multiclass unary_atomic_intrin<SDNode atomic_op> { + def _8 : PatFrag<(ops node:$ptr), + (atomic_op node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; + }]>; + def _16 : PatFrag<(ops node:$ptr), + (atomic_op node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; + }]>; + def _32 : PatFrag<(ops node:$ptr), + (atomic_op node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; + }]>; + def _64 : PatFrag<(ops node:$ptr), + (atomic_op node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; + }]>; +} + +defm X86lock_inc : unary_atomic_intrin<X86lock_inc>; +defm X86lock_dec : unary_atomic_intrin<X86lock_dec>; + +defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "X86lock_inc", "inc">; +defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "X86lock_dec", "dec">; + +// Atomic compare and swap. 
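+//
+// For reference, a 32-bit compare-and-swap built from the definitions below
+// looks roughly like this (register choices arbitrary, expected value in %eax):
+//   lock cmpxchgl %esi, (%rdi)   # ZF=1 and memory updated on success;
+//                                # otherwise %eax receives the current value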
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
+                         SDPatternOperator frag, X86MemOperand x86memop> {
+let isCodeGenOnly = 1, usesCustomInserter = 1 in {
+  def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
+               !strconcat(mnemonic, "\t$ptr"),
+               [(frag addr:$ptr)]>, TB, LOCK;
+}
+}
+
+multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
+                          string mnemonic, SDPatternOperator frag> {
+let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
+  let Defs = [AL, EFLAGS], Uses = [AL] in
+  def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
+                 !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
+                 [(frag addr:$ptr, GR8:$swap, 1)]>, TB, LOCK;
+  let Defs = [AX, EFLAGS], Uses = [AX] in
+  def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
+                  !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
+                  [(frag addr:$ptr, GR16:$swap, 2)]>, TB, OpSize16, LOCK;
+  let Defs = [EAX, EFLAGS], Uses = [EAX] in
+  def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
+                  !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
+                  [(frag addr:$ptr, GR32:$swap, 4)]>, TB, OpSize32, LOCK;
+  let Defs = [RAX, EFLAGS], Uses = [RAX] in
+  def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
+                   !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
+                   [(frag addr:$ptr, GR64:$swap, 8)]>, TB, LOCK;
+}
+}
+
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
+    SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
+}
+
+// This pseudo must be used when the frame uses RBX as
+// the base pointer. Indeed, in such a situation RBX is a reserved
+// register and the register allocator will ignore any use/def of
+// it. In other words, the register allocator will not fix the clobbering of
+// RBX that will happen when setting the arguments for the instruction.
+//
+// Unlike the actual related instruction, we mark that this one
+// defines EBX (instead of using EBX).
+// The rationale is that we will define RBX during the expansion of
+// the pseudo. The argument feeding EBX is ebx_input.
+//
+// The additional argument, $ebx_save, is a temporary register used to
+// save the value of RBX across the actual instruction.
+//
+// To make sure the register assigned to $ebx_save does not interfere with
+// the definition of the actual instruction, we use a definition $dst which
+// is tied to $ebx_save. That way, the live-range of $ebx_save spans across
+// the instruction and we are sure we will have a valid register to restore
+// the value of RBX.
+let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
+    SchedRW = [WriteALULd, WriteRMW], isCodeGenOnly = 1, isPseudo = 1,
+    Constraints = "$ebx_save = $dst", usesCustomInserter = 1 in {
+def LCMPXCHG8B_SAVE_EBX :
+    I<0, Pseudo, (outs GR32:$dst),
+      (ins i64mem:$ptr, GR32:$ebx_input, GR32:$ebx_save),
+      !strconcat("cmpxchg8b", "\t$ptr"),
+      [(set GR32:$dst, (X86cas8save_ebx addr:$ptr, GR32:$ebx_input,
+                                        GR32:$ebx_save))]>;
+}
+
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+    Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
+defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
+                                 X86cas16, i128mem>, REX_W;
+}
+
+// Same as LCMPXCHG8B_SAVE_EBX but for the 16-byte variant.
+let Defs = [RAX, RDX, RBX, EFLAGS], Uses = [RAX, RCX, RDX], + Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW], + isCodeGenOnly = 1, isPseudo = 1, Constraints = "$rbx_save = $dst", + usesCustomInserter = 1 in { +def LCMPXCHG16B_SAVE_RBX : + I<0, Pseudo, (outs GR64:$dst), + (ins i128mem:$ptr, GR64:$rbx_input, GR64:$rbx_save), + !strconcat("cmpxchg16b", "\t$ptr"), + [(set GR64:$dst, (X86cas16save_rbx addr:$ptr, GR64:$rbx_input, + GR64:$rbx_save))]>; +} + +defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg", X86cas>; + +// Atomic exchange and add +multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic, + string frag> { + let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1, + SchedRW = [WriteALULd, WriteRMW] in { + def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), + [(set GR8:$dst, + (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>; + def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), + [(set + GR16:$dst, + (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>, + OpSize16; + def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), + [(set + GR32:$dst, + (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>, + OpSize32; + def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), + [(set + GR64:$dst, + (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>; + } +} + +defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add">, TB, LOCK; + +/* The following multiclass tries to make sure that in code like + * x.store (immediate op x.load(acquire), release) + * and + * x.store (register op x.load(acquire), release) + * an operation directly on memory is generated instead of wasting a register. 
+ * It is not automatic as atomic_store/load are only lowered to MOV instructions
+ * extremely late to prevent them from being accidentally reordered in the backend
+ * (see the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions below)
+ */
+multiclass RELEASE_BINOP_MI<SDNode op> {
+  def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
+      "#BINOP "#NAME#"8mi PSEUDO!",
+      [(atomic_store_8 addr:$dst, (op
+          (atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
+  def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src),
+      "#BINOP "#NAME#"8mr PSEUDO!",
+      [(atomic_store_8 addr:$dst, (op
+          (atomic_load_8 addr:$dst), GR8:$src))]>;
+  // NAME#16 is not generated as 16-bit arithmetic instructions are considered
+  // costly and avoided as far as possible by this backend anyway.
+  def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
+      "#BINOP "#NAME#"32mi PSEUDO!",
+      [(atomic_store_32 addr:$dst, (op
+          (atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
+  def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+      "#BINOP "#NAME#"32mr PSEUDO!",
+      [(atomic_store_32 addr:$dst, (op
+          (atomic_load_32 addr:$dst), GR32:$src))]>;
+  def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
+      "#BINOP "#NAME#"64mi32 PSEUDO!",
+      [(atomic_store_64 addr:$dst, (op
+          (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
+  def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+      "#BINOP "#NAME#"64mr PSEUDO!",
+      [(atomic_store_64 addr:$dst, (op
+          (atomic_load_64 addr:$dst), GR64:$src))]>;
+}
+let Defs = [EFLAGS], SchedRW = [WriteMicrocoded] in {
+  defm RELEASE_ADD : RELEASE_BINOP_MI<add>;
+  defm RELEASE_AND : RELEASE_BINOP_MI<and>;
+  defm RELEASE_OR : RELEASE_BINOP_MI<or>;
+  defm RELEASE_XOR : RELEASE_BINOP_MI<xor>;
+  // Note: we don't deal with sub, because subtractions of constants are
+  // optimized into additions before this code can run.
+}
+
+// Same as above, but for floating-point.
+// FIXME: imm version.
+// FIXME: Version that doesn't clobber $src, using AVX's VADDSS.
+// FIXME: This could also handle SIMD operations with *ps and *pd instructions.
+let usesCustomInserter = 1, SchedRW = [WriteMicrocoded] in {
+multiclass RELEASE_FP_BINOP_MI<SDNode op> {
+  def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src),
+      "#BINOP "#NAME#"32mr PSEUDO!",
+      [(atomic_store_32 addr:$dst,
+         (i32 (bitconvert (op
+            (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))),
+            FR32:$src))))]>, Requires<[HasSSE1]>;
+  def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src),
+      "#BINOP "#NAME#"64mr PSEUDO!",
+      [(atomic_store_64 addr:$dst,
+         (i64 (bitconvert (op
+            (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))),
+            FR64:$src))))]>, Requires<[HasSSE2]>;
+}
+defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>;
+// FIXME: Add fsub, fmul, fdiv, ...
+} + +multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { + def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), + "#UNOP "#NAME#"8m PSEUDO!", + [(atomic_store_8 addr:$dst, dag8)]>; + def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), + "#UNOP "#NAME#"16m PSEUDO!", + [(atomic_store_16 addr:$dst, dag16)]>; + def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), + "#UNOP "#NAME#"32m PSEUDO!", + [(atomic_store_32 addr:$dst, dag32)]>; + def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), + "#UNOP "#NAME#"64m PSEUDO!", + [(atomic_store_64 addr:$dst, dag64)]>; +} + +let Defs = [EFLAGS], Predicates = [UseIncDec], SchedRW = [WriteMicrocoded] in { + defm RELEASE_INC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 1)), + (add (atomic_load_16 addr:$dst), (i16 1)), + (add (atomic_load_32 addr:$dst), (i32 1)), + (add (atomic_load_64 addr:$dst), (i64 1))>; + defm RELEASE_DEC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 -1)), + (add (atomic_load_16 addr:$dst), (i16 -1)), + (add (atomic_load_32 addr:$dst), (i32 -1)), + (add (atomic_load_64 addr:$dst), (i64 -1))>; +} +/* +TODO: These don't work because the type inference of TableGen fails. +TODO: find a way to fix it. +let Defs = [EFLAGS] in { + defm RELEASE_NEG : RELEASE_UNOP< + (ineg (atomic_load_8 addr:$dst)), + (ineg (atomic_load_16 addr:$dst)), + (ineg (atomic_load_32 addr:$dst)), + (ineg (atomic_load_64 addr:$dst))>; +} +// NOT doesn't set flags. +defm RELEASE_NOT : RELEASE_UNOP< + (not (atomic_load_8 addr:$dst)), + (not (atomic_load_16 addr:$dst)), + (not (atomic_load_32 addr:$dst)), + (not (atomic_load_64 addr:$dst))>; +*/ + +let SchedRW = [WriteMicrocoded] in { +def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#RELEASE_MOV8mi PSEUDO!", + [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; +def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), + "#RELEASE_MOV16mi PSEUDO!", + [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; +def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#RELEASE_MOV32mi PSEUDO!", + [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; +def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#RELEASE_MOV64mi32 PSEUDO!", + [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; + +def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), + "#RELEASE_MOV8mr PSEUDO!", + [(atomic_store_8 addr:$dst, GR8 :$src)]>; +def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), + "#RELEASE_MOV16mr PSEUDO!", + [(atomic_store_16 addr:$dst, GR16:$src)]>; +def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#RELEASE_MOV32mr PSEUDO!", + [(atomic_store_32 addr:$dst, GR32:$src)]>; +def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#RELEASE_MOV64mr PSEUDO!", + [(atomic_store_64 addr:$dst, GR64:$src)]>; + +def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), + "#ACQUIRE_MOV8rm PSEUDO!", + [(set GR8:$dst, (atomic_load_8 addr:$src))]>; +def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), + "#ACQUIRE_MOV16rm PSEUDO!", + [(set GR16:$dst, (atomic_load_16 addr:$src))]>; +def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), + "#ACQUIRE_MOV32rm PSEUDO!", + [(set GR32:$dst, (atomic_load_32 addr:$src))]>; +def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), + "#ACQUIRE_MOV64rm PSEUDO!", + [(set GR64:$dst, (atomic_load_64 addr:$src))]>; +} // SchedRW + 
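+// On x86 the RELEASE_MOV* / ACQUIRE_MOV* pseudos above need no fences; they are
+// lowered to plain moves very late, e.g. (register choices arbitrary):
+//   movl %esi, (%rdi)      # RELEASE_MOV32mr
+//   movl (%rdi), %eax      # ACQUIRE_MOV32rm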
+//===----------------------------------------------------------------------===// +// DAG Pattern Matching Rules +//===----------------------------------------------------------------------===// + +// Use AND/OR to store 0/-1 in memory when optimizing for minsize. This saves +// binary size compared to a regular MOV, but it introduces an unnecessary +// load, so is not suitable for regular or optsize functions. +let Predicates = [OptForMinSize] in { +def : Pat<(store (i16 0), addr:$dst), (AND16mi8 addr:$dst, 0)>; +def : Pat<(store (i32 0), addr:$dst), (AND32mi8 addr:$dst, 0)>; +def : Pat<(store (i64 0), addr:$dst), (AND64mi8 addr:$dst, 0)>; +def : Pat<(store (i16 -1), addr:$dst), (OR16mi8 addr:$dst, -1)>; +def : Pat<(store (i32 -1), addr:$dst), (OR32mi8 addr:$dst, -1)>; +def : Pat<(store (i64 -1), addr:$dst), (OR64mi8 addr:$dst, -1)>; +} + +// In kernel code model, we can get the address of a label +// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of +// the MOV64ri32 should accept these. +def : Pat<(i64 (X86Wrapper tconstpool :$dst)), + (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tjumptable :$dst)), + (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)), + (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper texternalsym:$dst)), + (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper mcsym:$dst)), + (MOV64ri32 mcsym:$dst)>, Requires<[KernelCode]>; +def : Pat<(i64 (X86Wrapper tblockaddress:$dst)), + (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>; + +// If we have small model and -static mode, it is safe to store global addresses +// directly as immediates. FIXME: This is really a hack, the 'imm' predicate +// for MOV64mi32 should handle this sort of thing. +def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tconstpool:$src)>, + Requires<[NearData, IsNotPIC]>; +def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tjumptable:$src)>, + Requires<[NearData, IsNotPIC]>; +def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tglobaladdr:$src)>, + Requires<[NearData, IsNotPIC]>; +def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, texternalsym:$src)>, + Requires<[NearData, IsNotPIC]>; +def : Pat<(store (i64 (X86Wrapper mcsym:$src)), addr:$dst), + (MOV64mi32 addr:$dst, mcsym:$src)>, + Requires<[NearData, IsNotPIC]>; +def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst), + (MOV64mi32 addr:$dst, tblockaddress:$src)>, + Requires<[NearData, IsNotPIC]>; + +def : Pat<(i32 (X86RecoverFrameAlloc mcsym:$dst)), (MOV32ri mcsym:$dst)>; +def : Pat<(i64 (X86RecoverFrameAlloc mcsym:$dst)), (MOV64ri mcsym:$dst)>; + +// Calls + +// tls has some funny stuff here... +// This corresponds to movabs $foo@tpoff, %rax +def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)), + (MOV64ri32 tglobaltlsaddr :$dst)>; +// This corresponds to add $foo@tpoff, %rax +def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)), + (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>; + + +// Direct PC relative function call for small code model. 32-bit displacement +// sign extended to 64-bit. +def : Pat<(X86call (i64 tglobaladdr:$dst)), + (CALL64pcrel32 tglobaladdr:$dst)>; +def : Pat<(X86call (i64 texternalsym:$dst)), + (CALL64pcrel32 texternalsym:$dst)>; + +// Tailcall stuff. 
The TCRETURN instructions execute after the epilog, so they +// can never use callee-saved registers. That is the purpose of the GR64_TC +// register classes. +// +// The only volatile register that is never used by the calling convention is +// %r11. This happens when calling a vararg function with 6 arguments. +// +// Match an X86tcret that uses less than 7 volatile registers. +def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off), + (X86tcret node:$ptr, node:$off), [{ + // X86tcret args: (*chain, ptr, imm, regs..., glue) + unsigned NumRegs = 0; + for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i) + if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6) + return false; + return true; +}]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[Not64BitMode, NotUseRetpoline]>; + +// FIXME: This is disabled for 32-bit PIC mode because the global base +// register which is part of the address mode may be assigned a +// callee-saved register. +def : Pat<(X86tcret (load addr:$dst), imm:$off), + (TCRETURNmi addr:$dst, imm:$off)>, + Requires<[Not64BitMode, IsNotPIC, NotUseRetpoline]>; + +def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), + (TCRETURNdi tglobaladdr:$dst, imm:$off)>, + Requires<[NotLP64]>; + +def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), + (TCRETURNdi texternalsym:$dst, imm:$off)>, + Requires<[NotLP64]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode, NotUseRetpoline]>; + +// Don't fold loads into X86tcret requiring more than 6 regs. +// There wouldn't be enough scratch registers for base+index. +def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), + (TCRETURNmi64 addr:$dst, imm:$off)>, + Requires<[In64BitMode, NotUseRetpoline]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[In64BitMode, UseRetpoline]>; + +def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), + (RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>, + Requires<[Not64BitMode, UseRetpoline]>; + +def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), + (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, + Requires<[IsLP64]>; + +def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), + (TCRETURNdi64 texternalsym:$dst, imm:$off)>, + Requires<[IsLP64]>; + +// Normal calls, with various flavors of addresses. +def : Pat<(X86call (i32 tglobaladdr:$dst)), + (CALLpcrel32 tglobaladdr:$dst)>; +def : Pat<(X86call (i32 texternalsym:$dst)), + (CALLpcrel32 texternalsym:$dst)>; +def : Pat<(X86call (i32 imm:$dst)), + (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>; + +// Comparisons. + +// TEST R,R is smaller than CMP R,0 +def : Pat<(X86cmp GR8:$src1, 0), + (TEST8rr GR8:$src1, GR8:$src1)>; +def : Pat<(X86cmp GR16:$src1, 0), + (TEST16rr GR16:$src1, GR16:$src1)>; +def : Pat<(X86cmp GR32:$src1, 0), + (TEST32rr GR32:$src1, GR32:$src1)>; +def : Pat<(X86cmp GR64:$src1, 0), + (TEST64rr GR64:$src1, GR64:$src1)>; + +// Conditional moves with folded loads with operands swapped and conditions +// inverted. 
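+//
+// For example, the first defm below maps
+//   (X86cmov (loadi32 addr:$m), GR32:$r, X86_COND_B, EFLAGS)
+// onto (CMOVAE32rm GR32:$r, addr:$m): cmovael loads from $m when AE (i.e. not B)
+// holds and keeps $r otherwise, which is exactly "B ? $r : [$m]".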
+multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32, + Instruction Inst64> { + let Predicates = [HasCMov] in { + def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS), + (Inst16 GR16:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS), + (Inst32 GR32:$src2, addr:$src1)>; + def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS), + (Inst64 GR64:$src2, addr:$src1)>; + } +} + +defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>; +defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>; +defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>; +defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>; +defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>; +defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>; +defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>; +defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>; +defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>; +defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>; +defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>; +defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>; +defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>; +defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>; +defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>; +defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; + +// zextload bool -> zextload byte +// i1 stored in one byte in zero-extended form. +// Upper bits cleanup should be executed before Store. +def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>; +def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(zextloadi64i1 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; + +// extload bool -> extload byte +// When extloading from 16-bit and smaller memory locations into 64-bit +// registers, use zero-extending loads so that the entire 64-bit register is +// defined, avoiding partial-register updates. + +def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>; +def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>; +def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>; +def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>; + +// For other extloads, use subregs, since the high contents of the register are +// defined after an extload. +def : Pat<(extloadi64i1 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i8 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i16 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; +def : Pat<(extloadi64i32 addr:$src), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; + +// anyext. Define these to do an explicit zero-extend to +// avoid partial-register updates. +def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG + (MOVZX32rr8 GR8 :$src), sub_16bit)>; +def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>; + +// Except for i16 -> i32 since isel expect i16 ops to be promoted to i32. 
+def : Pat<(i32 (anyext GR16:$src)),
+          (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
+
+def : Pat<(i64 (anyext GR8 :$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR16:$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR32:$src)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, sub_32bit)>;
+
+
+// Any real instruction that defines a 32-bit result zeroes the high half of
+// the 64-bit register, but not every 32-bit node is such an instruction:
+// Truncate can be lowered to EXTRACT_SUBREG, and CopyFromReg may be copying
+// from a truncate. Any other 32-bit operation will zero-extend
+// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper
+// 32 bits, they're probably just qualifying a CopyFromReg.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg &&
+         N->getOpcode() != ISD::AssertSext &&
+         N->getOpcode() != ISD::AssertZext;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+
+//===----------------------------------------------------------------------===//
+// Pattern match OR as ADD
+//===----------------------------------------------------------------------===//
+
+// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
+// 3-addressified into an LEA instruction to avoid copies. However, we also
+// want to finally emit these instructions as an or at the end of the code
+// generator to make the generated code easier to read. To do this, we select
+// into "disjoint bits" pseudo ops.
+
+// Treat an 'or' node as an 'add' if the or'ed bits are known to be zero.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+
+  KnownBits Known0;
+  CurDAG->computeKnownBits(N->getOperand(0), Known0, 0);
+  KnownBits Known1;
+  CurDAG->computeKnownBits(N->getOperand(1), Known1, 0);
+  return (~Known0.Zero & ~Known1.Zero) == 0;
+}]>;
+
+
+// (or x1, x2) -> (add x1, x2) if the two operands are known not to share bits.
+// Try this before selecting an OR.
+let AddedComplexity = 5, SchedRW = [WriteALU] in {
+
+let isConvertibleToThreeAddress = 1,
+    Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
+let isCommutable = 1 in {
+def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                   "", // orw/addw REG, REG
+                   [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
+def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                   "", // orl/addl REG, REG
+                   [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
+def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                   "", // orq/addq REG, REG
+                   [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
+} // isCommutable
+
+// NOTE: These are order specific, we want the ri8 forms to be listed
+// first so that they are slightly preferred to the ri forms.
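+// (The ri8 forms take a sign-extended 8-bit immediate, e.g. "addl $16, %ecx"
+// is 3 bytes (83 c1 10) rather than 6 (81 c1 10 00 00 00), so preferring them
+// when the immediate fits saves code size.)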
+ +def ADD16ri8_DB : I<0, Pseudo, + (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "", // orw/addw REG, imm8 + [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>; +def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), + "", // orw/addw REG, imm + [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>; + +def ADD32ri8_DB : I<0, Pseudo, + (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "", // orl/addl REG, imm8 + [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>; +def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), + "", // orl/addl REG, imm + [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>; + + +def ADD64ri8_DB : I<0, Pseudo, + (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "", // orq/addq REG, imm8 + [(set GR64:$dst, (or_is_add GR64:$src1, + i64immSExt8:$src2))]>; +def ADD64ri32_DB : I<0, Pseudo, + (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), + "", // orq/addq REG, imm + [(set GR64:$dst, (or_is_add GR64:$src1, + i64immSExt32:$src2))]>; +} +} // AddedComplexity, SchedRW + +//===----------------------------------------------------------------------===// +// Pattern match SUB as XOR +//===----------------------------------------------------------------------===// + +// An immediate in the LHS of a subtract can't be encoded in the instruction. +// If there is no possibility of a borrow we can use an XOR instead of a SUB +// to enable the immediate to be folded. +// TODO: Move this to a DAG combine? + +def sub_is_xor : PatFrag<(ops node:$lhs, node:$rhs), (sub node:$lhs, node:$rhs),[{ + if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(0))) { + KnownBits Known; + CurDAG->computeKnownBits(N->getOperand(1), Known); + + // If all possible ones in the RHS are set in the LHS then there can't be + // a borrow and we can use xor. + return (~Known.Zero).isSubsetOf(CN->getAPIntValue()); + } + + return false; +}]>; + +let AddedComplexity = 5 in { +def : Pat<(sub_is_xor imm:$src2, GR8:$src1), + (XOR8ri GR8:$src1, imm:$src2)>; +def : Pat<(sub_is_xor i16immSExt8:$src2, GR16:$src1), + (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(sub_is_xor imm:$src2, GR16:$src1), + (XOR16ri GR16:$src1, imm:$src2)>; +def : Pat<(sub_is_xor i32immSExt8:$src2, GR32:$src1), + (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(sub_is_xor imm:$src2, GR32:$src1), + (XOR32ri GR32:$src1, imm:$src2)>; +def : Pat<(sub_is_xor i64immSExt8:$src2, GR64:$src1), + (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(sub_is_xor i64immSExt32:$src2, GR64:$src1), + (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>; +} + +//===----------------------------------------------------------------------===// +// Some peepholes +//===----------------------------------------------------------------------===// + +// Odd encoding trick: -128 fits into an 8-bit immediate field while +// +128 doesn't, so in this special case use a sub instead of an add. +def : Pat<(add GR16:$src1, 128), + (SUB16ri8 GR16:$src1, -128)>; +def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst), + (SUB16mi8 addr:$dst, -128)>; + +def : Pat<(add GR32:$src1, 128), + (SUB32ri8 GR32:$src1, -128)>; +def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst), + (SUB32mi8 addr:$dst, -128)>; + +def : Pat<(add GR64:$src1, 128), + (SUB64ri8 GR64:$src1, -128)>; +def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst), + (SUB64mi8 addr:$dst, -128)>; + +// The same trick applies for 32-bit immediate fields in 64-bit +// instructions. 
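+// For example, +0x80000000 is not representable as a sign-extended imm32, but
+// its negation is, so "addq $0x80000000, %rax" is emitted as
+// "subq $-2147483648, %rax" by the patterns below.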
+def : Pat<(add GR64:$src1, 0x0000000080000000), + (SUB64ri32 GR64:$src1, 0xffffffff80000000)>; +def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst), + (SUB64mi32 addr:$dst, 0xffffffff80000000)>; + +// To avoid needing to materialize an immediate in a register, use a 32-bit and +// with implicit zero-extension instead of a 64-bit and if the immediate has at +// least 32 bits of leading zeros. If in addition the last 32 bits can be +// represented with a sign extension of a 8 bit constant, use that. +// This can also reduce instruction size by eliminating the need for the REX +// prefix. + +// AddedComplexity is needed to give priority over i64immSExt8 and i64immSExt32. +let AddedComplexity = 1 in { +def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm), + (SUBREG_TO_REG + (i64 0), + (AND32ri8 + (EXTRACT_SUBREG GR64:$src, sub_32bit), + (i32 (GetLo8XForm imm:$imm))), + sub_32bit)>; + +def : Pat<(and GR64:$src, i64immZExt32:$imm), + (SUBREG_TO_REG + (i64 0), + (AND32ri + (EXTRACT_SUBREG GR64:$src, sub_32bit), + (i32 (GetLo32XForm imm:$imm))), + sub_32bit)>; +} // AddedComplexity = 1 + + +// AddedComplexity is needed due to the increased complexity on the +// i64immZExt32SExt8 and i64immZExt32 patterns above. Applying this to all +// the MOVZX patterns keeps thems together in DAGIsel tables. +let AddedComplexity = 1 in { +// r & (2^16-1) ==> movz +def : Pat<(and GR32:$src1, 0xffff), + (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>; +// r & (2^8-1) ==> movz +def : Pat<(and GR32:$src1, 0xff), + (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>; +// r & (2^8-1) ==> movz +def : Pat<(and GR16:$src1, 0xff), + (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG GR16:$src1, sub_8bit)), + sub_16bit)>; + +// r & (2^32-1) ==> movz +def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), + (SUBREG_TO_REG (i64 0), + (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), + sub_32bit)>; +// r & (2^16-1) ==> movz +def : Pat<(and GR64:$src, 0xffff), + (SUBREG_TO_REG (i64 0), + (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))), + sub_32bit)>; +// r & (2^8-1) ==> movz +def : Pat<(and GR64:$src, 0xff), + (SUBREG_TO_REG (i64 0), + (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))), + sub_32bit)>; +} // AddedComplexity = 1 + + +// Try to use BTS/BTR/BTC for single bit operations on the upper 32-bits. + +def BTRXForm : SDNodeXForm<imm, [{ + // Transformation function: Find the lowest 0. + return getI64Imm((uint8_t)N->getAPIntValue().countTrailingOnes(), SDLoc(N)); +}]>; + +def BTCBTSXForm : SDNodeXForm<imm, [{ + // Transformation function: Find the lowest 1. + return getI64Imm((uint8_t)N->getAPIntValue().countTrailingZeros(), SDLoc(N)); +}]>; + +def BTRMask64 : ImmLeaf<i64, [{ + return !isUInt<32>(Imm) && !isInt<32>(Imm) && isPowerOf2_64(~Imm); +}]>; + +def BTCBTSMask64 : ImmLeaf<i64, [{ + return !isInt<32>(Imm) && isPowerOf2_64(Imm); +}]>; + +// For now only do this for optsize. 
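+// e.g. setting one of the high bits with "btsq $imm8, %rax" avoids first
+// materializing the 64-bit mask with a movabsq; this is a clear win for size
+// but not necessarily for speed, hence the OptForSize restriction.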
+let AddedComplexity = 1, Predicates=[OptForSize] in { + def : Pat<(and GR64:$src1, BTRMask64:$mask), + (BTR64ri8 GR64:$src1, (BTRXForm imm:$mask))>; + def : Pat<(or GR64:$src1, BTCBTSMask64:$mask), + (BTS64ri8 GR64:$src1, (BTCBTSXForm imm:$mask))>; + def : Pat<(xor GR64:$src1, BTCBTSMask64:$mask), + (BTC64ri8 GR64:$src1, (BTCBTSXForm imm:$mask))>; +} + + +// sext_inreg patterns +def : Pat<(sext_inreg GR32:$src, i16), + (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>; +def : Pat<(sext_inreg GR32:$src, i8), + (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>; + +def : Pat<(sext_inreg GR16:$src, i8), + (EXTRACT_SUBREG (MOVSX32rr8 (EXTRACT_SUBREG GR16:$src, sub_8bit)), + sub_16bit)>; + +def : Pat<(sext_inreg GR64:$src, i32), + (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; +def : Pat<(sext_inreg GR64:$src, i16), + (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>; +def : Pat<(sext_inreg GR64:$src, i8), + (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>; + +// sext, sext_load, zext, zext_load +def: Pat<(i16 (sext GR8:$src)), + (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(sextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>; +def: Pat<(i16 (zext GR8:$src)), + (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>; +def: Pat<(zextloadi16i8 addr:$src), + (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>; + +// trunc patterns +def : Pat<(i16 (trunc GR32:$src)), + (EXTRACT_SUBREG GR32:$src, sub_16bit)>; +def : Pat<(i8 (trunc GR32:$src)), + (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), + sub_8bit)>, + Requires<[Not64BitMode]>; +def : Pat<(i8 (trunc GR16:$src)), + (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), + sub_8bit)>, + Requires<[Not64BitMode]>; +def : Pat<(i32 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_32bit)>; +def : Pat<(i16 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_16bit)>; +def : Pat<(i8 (trunc GR64:$src)), + (EXTRACT_SUBREG GR64:$src, sub_8bit)>; +def : Pat<(i8 (trunc GR32:$src)), + (EXTRACT_SUBREG GR32:$src, sub_8bit)>, + Requires<[In64BitMode]>; +def : Pat<(i8 (trunc GR16:$src)), + (EXTRACT_SUBREG GR16:$src, sub_8bit)>, + Requires<[In64BitMode]>; + +def immff00_ffff : ImmLeaf<i32, [{ + return Imm >= 0xff00 && Imm <= 0xffff; +}]>; + +// h-register tricks +def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))), + (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>, + Requires<[Not64BitMode]>; +def : Pat<(i8 (trunc (srl_su (i32 (anyext GR16:$src)), (i8 8)))), + (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>, + Requires<[Not64BitMode]>; +def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))), + (EXTRACT_SUBREG GR32:$src, sub_8bit_hi)>, + Requires<[Not64BitMode]>; +def : Pat<(srl GR16:$src, (i8 8)), + (EXTRACT_SUBREG + (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)), + sub_16bit)>; +def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), + (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>; +def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), + (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>; +def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), + (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>; +def : Pat<(srl (and_su GR32:$src, immff00_ffff), (i8 8)), + (MOVZX32rr8_NOREX (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>; + +// h-register tricks. +// For now, be conservative on x86-64 and use an h-register extract only if the +// value is immediately zero-extended or stored, which are somewhat common +// cases. 
This uses a bunch of code to prevent a register requiring a REX prefix +// from being allocated in the same instruction as the h register, as there's +// currently no way to describe this requirement to the register allocator. + +// h-register extract and zero-extend. +def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)), + (SUBREG_TO_REG + (i64 0), + (MOVZX32rr8_NOREX + (EXTRACT_SUBREG GR64:$src, sub_8bit_hi)), + sub_32bit)>; +def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))), + (SUBREG_TO_REG + (i64 0), + (MOVZX32rr8_NOREX + (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)), + sub_32bit)>; +def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))), + (SUBREG_TO_REG + (i64 0), + (MOVZX32rr8_NOREX + (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)), + sub_32bit)>; + +// h-register extract and store. +def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG GR64:$src, sub_8bit_hi))>; +def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>, + Requires<[In64BitMode]>; +def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), + (MOV8mr_NOREX + addr:$dst, + (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>, + Requires<[In64BitMode]>; + + +// (shl x, 1) ==> (add x, x) +// Note that if x is undef (immediate or otherwise), we could theoretically +// end up with the two uses of x getting different values, producing a result +// where the least significant bit is not 0. However, the probability of this +// happening is considered low enough that this is officially not a +// "real problem". +def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>; +def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; +def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; +def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; + +// Helper imms to check if a mask doesn't change significant shift/rotate bits. +def immShift8 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 3; +}]>; +def immShift16 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 4; +}]>; +def immShift32 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 5; +}]>; +def immShift64 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 6; +}]>; + +// Shift amount is implicitly masked. 
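+// The hardware already ANDs the count in CL with 31 (63 for 64-bit operands),
+// so a redundant mask in the DAG can simply be dropped, e.g.:
+//   (shl GR32:$x, (and CL, 31))  -->  (SHL32rCL GR32:$x)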
+multiclass MaskedShiftAmountPats<SDNode frag, string name> { + // (shift x (and y, 31)) ==> (shift x, y) + def : Pat<(frag GR8:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "8rCL") GR8:$src1)>; + def : Pat<(frag GR16:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "16rCL") GR16:$src1)>; + def : Pat<(frag GR32:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "32rCL") GR32:$src1)>; + def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "8mCL") addr:$dst)>; + def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "16mCL") addr:$dst)>; + def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "32mCL") addr:$dst)>; + + // (shift x (and y, 63)) ==> (shift x, y) + def : Pat<(frag GR64:$src1, (and CL, immShift64)), + (!cast<Instruction>(name # "64rCL") GR64:$src1)>; + def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst), + (!cast<Instruction>(name # "64mCL") addr:$dst)>; +} + +defm : MaskedShiftAmountPats<shl, "SHL">; +defm : MaskedShiftAmountPats<srl, "SHR">; +defm : MaskedShiftAmountPats<sra, "SAR">; + +// ROL/ROR instructions allow a stronger mask optimization than shift for 8- and +// 16-bit. We can remove a mask of any (bitwidth - 1) on the rotation amount +// because over-rotating produces the same result. This is noted in the Intel +// docs with: "tempCOUNT <- (COUNT & COUNTMASK) MOD SIZE". Masking the rotation +// amount could affect EFLAGS results, but that does not matter because we are +// not tracking flags for these nodes. +multiclass MaskedRotateAmountPats<SDNode frag, string name> { + // (rot x (and y, BitWidth - 1)) ==> (rot x, y) + def : Pat<(frag GR8:$src1, (and CL, immShift8)), + (!cast<Instruction>(name # "8rCL") GR8:$src1)>; + def : Pat<(frag GR16:$src1, (and CL, immShift16)), + (!cast<Instruction>(name # "16rCL") GR16:$src1)>; + def : Pat<(frag GR32:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "32rCL") GR32:$src1)>; + def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift8)), addr:$dst), + (!cast<Instruction>(name # "8mCL") addr:$dst)>; + def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift16)), addr:$dst), + (!cast<Instruction>(name # "16mCL") addr:$dst)>; + def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "32mCL") addr:$dst)>; + + // (rot x (and y, 63)) ==> (rot x, y) + def : Pat<(frag GR64:$src1, (and CL, immShift64)), + (!cast<Instruction>(name # "64rCL") GR64:$src1)>; + def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst), + (!cast<Instruction>(name # "64mCL") addr:$dst)>; +} + + +defm : MaskedRotateAmountPats<rotl, "ROL">; +defm : MaskedRotateAmountPats<rotr, "ROR">; + +// Double shift amount is implicitly masked. 
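+// The SHLD/SHRD count in CL is masked the same way, so for example
+//   (X86shld GR32:$a, GR32:$b, (and CL, 31))  -->  (SHLD32rrCL GR32:$a, GR32:$b)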
+multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> { + // (shift x (and y, 31)) ==> (shift x, y) + def : Pat<(frag GR16:$src1, GR16:$src2, (and CL, immShift32)), + (!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>; + def : Pat<(frag GR32:$src1, GR32:$src2, (and CL, immShift32)), + (!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>; + + // (shift x (and y, 63)) ==> (shift x, y) + def : Pat<(frag GR64:$src1, GR64:$src2, (and CL, immShift64)), + (!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>; +} + +defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">; +defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">; + +let Predicates = [HasBMI2] in { + let AddedComplexity = 1 in { + def : Pat<(sra GR32:$src1, (and GR8:$src2, immShift32)), + (SARX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(sra GR64:$src1, (and GR8:$src2, immShift64)), + (SARX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(srl GR32:$src1, (and GR8:$src2, immShift32)), + (SHRX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(srl GR64:$src1, (and GR8:$src2, immShift64)), + (SHRX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(shl GR32:$src1, (and GR8:$src2, immShift32)), + (SHLX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(shl GR64:$src1, (and GR8:$src2, immShift64)), + (SHLX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + } + + def : Pat<(sra (loadi32 addr:$src1), (and GR8:$src2, immShift32)), + (SARX32rm addr:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(sra (loadi64 addr:$src1), (and GR8:$src2, immShift64)), + (SARX64rm addr:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(srl (loadi32 addr:$src1), (and GR8:$src2, immShift32)), + (SHRX32rm addr:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(srl (loadi64 addr:$src1), (and GR8:$src2, immShift64)), + (SHRX64rm addr:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(shl (loadi32 addr:$src1), (and GR8:$src2, immShift32)), + (SHLX32rm addr:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(shl (loadi64 addr:$src1), (and GR8:$src2, immShift64)), + (SHLX64rm addr:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; +} + +// Use BTR/BTS/BTC for clearing/setting/toggling a bit in a variable location. +multiclass one_bit_patterns<RegisterClass RC, ValueType VT, Instruction BTR, + Instruction BTS, Instruction BTC, + ImmLeaf ImmShift> { + def : Pat<(and RC:$src1, (rotl -2, GR8:$src2)), + (BTR RC:$src1, + (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(or RC:$src1, (shl 1, GR8:$src2)), + (BTS RC:$src1, + (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(xor RC:$src1, (shl 1, GR8:$src2)), + (BTC RC:$src1, + (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + // Similar to above, but removing unneeded masking of the shift amount. 
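+  // For register operands, BT/BTS/BTR/BTC already take the bit index modulo
+  // the operand width, so the (and GR8:$src2, ImmShift) mask below contributes
+  // nothing and can be dropped from the selected code.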
+ def : Pat<(and RC:$src1, (rotl -2, (and GR8:$src2, ImmShift))), + (BTR RC:$src1, + (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(or RC:$src1, (shl 1, (and GR8:$src2, ImmShift))), + (BTS RC:$src1, + (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(xor RC:$src1, (shl 1, (and GR8:$src2, ImmShift))), + (BTC RC:$src1, + (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; +} + +defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, immShift16>; +defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, immShift32>; +defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, immShift64>; + + +// (anyext (setcc_carry)) -> (setcc_carry) +def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C16r)>; +def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; +def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))), + (SETB_C32r)>; + +//===----------------------------------------------------------------------===// +// EFLAGS-defining Patterns +//===----------------------------------------------------------------------===// + +// add reg, reg +def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>; +def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>; +def : Pat<(add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>; + +// add reg, mem +def : Pat<(add GR8:$src1, (loadi8 addr:$src2)), + (ADD8rm GR8:$src1, addr:$src2)>; +def : Pat<(add GR16:$src1, (loadi16 addr:$src2)), + (ADD16rm GR16:$src1, addr:$src2)>; +def : Pat<(add GR32:$src1, (loadi32 addr:$src2)), + (ADD32rm GR32:$src1, addr:$src2)>; +def : Pat<(add GR64:$src1, (loadi64 addr:$src2)), + (ADD64rm GR64:$src1, addr:$src2)>; + +// add reg, imm +def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>; +def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; +def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; +def : Pat<(add GR16:$src1, i16immSExt8:$src2), + (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(add GR32:$src1, i32immSExt8:$src2), + (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(add GR64:$src1, i64immSExt8:$src2), + (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(add GR64:$src1, i64immSExt32:$src2), + (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// sub reg, reg +def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>; +def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>; +def : Pat<(sub GR64:$src1, GR64:$src2), (SUB64rr GR64:$src1, GR64:$src2)>; + +// sub reg, mem +def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)), + (SUB8rm GR8:$src1, addr:$src2)>; +def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)), + (SUB16rm GR16:$src1, addr:$src2)>; +def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)), + (SUB32rm GR32:$src1, addr:$src2)>; +def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)), + (SUB64rm GR64:$src1, addr:$src2)>; + +// sub reg, imm +def : Pat<(sub GR8:$src1, imm:$src2), + (SUB8ri GR8:$src1, imm:$src2)>; +def : Pat<(sub GR16:$src1, imm:$src2), + (SUB16ri GR16:$src1, imm:$src2)>; +def : Pat<(sub GR32:$src1, imm:$src2), + (SUB32ri GR32:$src1, imm:$src2)>; +def : Pat<(sub GR16:$src1, i16immSExt8:$src2), + (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(sub GR32:$src1, i32immSExt8:$src2), + (SUB32ri8 GR32:$src1, 
i32immSExt8:$src2)>; +def : Pat<(sub GR64:$src1, i64immSExt8:$src2), + (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(sub GR64:$src1, i64immSExt32:$src2), + (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// sub 0, reg +def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>; +def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>; +def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>; +def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>; + +// sub reg, relocImm +def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2), + (SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>; +def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt32_su:$src2), + (SUB64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>; + +// mul reg, reg +def : Pat<(mul GR16:$src1, GR16:$src2), + (IMUL16rr GR16:$src1, GR16:$src2)>; +def : Pat<(mul GR32:$src1, GR32:$src2), + (IMUL32rr GR32:$src1, GR32:$src2)>; +def : Pat<(mul GR64:$src1, GR64:$src2), + (IMUL64rr GR64:$src1, GR64:$src2)>; + +// mul reg, mem +def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)), + (IMUL16rm GR16:$src1, addr:$src2)>; +def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)), + (IMUL32rm GR32:$src1, addr:$src2)>; +def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)), + (IMUL64rm GR64:$src1, addr:$src2)>; + +// mul reg, imm +def : Pat<(mul GR16:$src1, imm:$src2), + (IMUL16rri GR16:$src1, imm:$src2)>; +def : Pat<(mul GR32:$src1, imm:$src2), + (IMUL32rri GR32:$src1, imm:$src2)>; +def : Pat<(mul GR16:$src1, i16immSExt8:$src2), + (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(mul GR32:$src1, i32immSExt8:$src2), + (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(mul GR64:$src1, i64immSExt8:$src2), + (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(mul GR64:$src1, i64immSExt32:$src2), + (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>; + +// reg = mul mem, imm +def : Pat<(mul (loadi16 addr:$src1), imm:$src2), + (IMUL16rmi addr:$src1, imm:$src2)>; +def : Pat<(mul (loadi32 addr:$src1), imm:$src2), + (IMUL32rmi addr:$src1, imm:$src2)>; +def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2), + (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>; +def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), + (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; +def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2), + (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>; +def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), + (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; + +// Increment/Decrement reg. +// Do not make INC/DEC if it is slow +let Predicates = [UseIncDec] in { + def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>; + def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>; + def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>; + def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; + def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>; + def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>; + def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>; + def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; +} + +// or reg/reg. 
+def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>; +def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>; + +// or reg/mem +def : Pat<(or GR8:$src1, (loadi8 addr:$src2)), + (OR8rm GR8:$src1, addr:$src2)>; +def : Pat<(or GR16:$src1, (loadi16 addr:$src2)), + (OR16rm GR16:$src1, addr:$src2)>; +def : Pat<(or GR32:$src1, (loadi32 addr:$src2)), + (OR32rm GR32:$src1, addr:$src2)>; +def : Pat<(or GR64:$src1, (loadi64 addr:$src2)), + (OR64rm GR64:$src1, addr:$src2)>; + +// or reg/imm +def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>; +def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>; +def : Pat<(or GR16:$src1, i16immSExt8:$src2), + (OR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(or GR32:$src1, i32immSExt8:$src2), + (OR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt8:$src2), + (OR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(or GR64:$src1, i64immSExt32:$src2), + (OR64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// xor reg/reg +def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>; +def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>; +def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>; + +// xor reg/mem +def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)), + (XOR8rm GR8:$src1, addr:$src2)>; +def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)), + (XOR16rm GR16:$src1, addr:$src2)>; +def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)), + (XOR32rm GR32:$src1, addr:$src2)>; +def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)), + (XOR64rm GR64:$src1, addr:$src2)>; + +// xor reg/imm +def : Pat<(xor GR8:$src1, imm:$src2), + (XOR8ri GR8:$src1, imm:$src2)>; +def : Pat<(xor GR16:$src1, imm:$src2), + (XOR16ri GR16:$src1, imm:$src2)>; +def : Pat<(xor GR32:$src1, imm:$src2), + (XOR32ri GR32:$src1, imm:$src2)>; +def : Pat<(xor GR16:$src1, i16immSExt8:$src2), + (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(xor GR32:$src1, i32immSExt8:$src2), + (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(xor GR64:$src1, i64immSExt8:$src2), + (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(xor GR64:$src1, i64immSExt32:$src2), + (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// and reg/reg +def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>; +def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>; +def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>; +def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>; + +// and reg/mem +def : Pat<(and GR8:$src1, (loadi8 addr:$src2)), + (AND8rm GR8:$src1, addr:$src2)>; +def : Pat<(and GR16:$src1, (loadi16 addr:$src2)), + (AND16rm GR16:$src1, addr:$src2)>; +def : Pat<(and GR32:$src1, (loadi32 addr:$src2)), + (AND32rm GR32:$src1, addr:$src2)>; +def : Pat<(and GR64:$src1, (loadi64 addr:$src2)), + (AND64rm GR64:$src1, addr:$src2)>; + +// and reg/imm +def : Pat<(and GR8:$src1, imm:$src2), + (AND8ri GR8:$src1, imm:$src2)>; +def : Pat<(and GR16:$src1, imm:$src2), + (AND16ri GR16:$src1, imm:$src2)>; +def : Pat<(and GR32:$src1, imm:$src2), + (AND32ri GR32:$src1, imm:$src2)>; +def : Pat<(and GR16:$src1, i16immSExt8:$src2), + (AND16ri8 
GR16:$src1, i16immSExt8:$src2)>; +def : Pat<(and GR32:$src1, i32immSExt8:$src2), + (AND32ri8 GR32:$src1, i32immSExt8:$src2)>; +def : Pat<(and GR64:$src1, i64immSExt8:$src2), + (AND64ri8 GR64:$src1, i64immSExt8:$src2)>; +def : Pat<(and GR64:$src1, i64immSExt32:$src2), + (AND64ri32 GR64:$src1, i64immSExt32:$src2)>; + +// Bit scan instruction patterns to match explicit zero-undef behavior. +def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>; +def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>; +def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>; +def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>; +def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>; +def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>; + +// When HasMOVBE is enabled it is possible to get a non-legalized +// register-register 16 bit bswap. This maps it to a ROL instruction. +let Predicates = [HasMOVBE] in { + def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>; +} + +// These patterns are selected by some custom code in X86ISelDAGToDAG.cpp that +// custom combines and+srl into BEXTR. We use these patterns to avoid a bunch +// of manual code for folding loads. +let Predicates = [HasBMI, NoTBM] in { + def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)), + (BEXTR32rr GR32:$src1, (MOV32ri imm:$src2))>; + def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)), + (BEXTR32rm addr:$src1, (MOV32ri imm:$src2))>; + def : Pat<(X86bextr GR64:$src1, mov64imm32:$src2), + (BEXTR64rr GR64:$src1, + (SUBREG_TO_REG (i64 0), + (MOV32ri64 mov64imm32:$src2), + sub_32bit))>; + def : Pat<(X86bextr (loadi64 addr:$src1), mov64imm32:$src2), + (BEXTR64rm addr:$src1, + (SUBREG_TO_REG (i64 0), + (MOV32ri64 mov64imm32:$src2), + sub_32bit))>; +} // HasBMI, NoTBM diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrControl.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrControl.td new file mode 100644 index 000000000..7121b0c9a --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrControl.td @@ -0,0 +1,413 @@ +//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 jump, return, call, and related instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Control Flow Instructions. +// + +// Return instructions. +// +// The X86retflag return instructions are variadic because we may add ST0 and +// ST1 arguments when returning values on the x87 stack. 
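+// For example, a function returning an 80-bit 'long double' leaves its result
+// in ST(0); adding ST0 as an operand of the return keeps that value live all
+// the way to the RET.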
+let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in { + def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops), + "ret{l}", []>, OpSize32, Requires<[Not64BitMode]>; + def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops), + "ret{q}", []>, OpSize32, Requires<[In64BitMode]>; + def RETW : I <0xC3, RawFrm, (outs), (ins), + "ret{w}", []>, OpSize16; + def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), + "ret{l}\t$amt", []>, OpSize32, Requires<[Not64BitMode]>; + def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops), + "ret{q}\t$amt", []>, OpSize32, Requires<[In64BitMode]>; + def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt), + "ret{w}\t$amt", []>, OpSize16; + def LRETL : I <0xCB, RawFrm, (outs), (ins), + "{l}ret{l|f}", []>, OpSize32; + def LRETQ : RI <0xCB, RawFrm, (outs), (ins), + "{l}ret{|f}q", []>, Requires<[In64BitMode]>; + def LRETW : I <0xCB, RawFrm, (outs), (ins), + "{l}ret{w|f}", []>, OpSize16; + def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), + "{l}ret{l|f}\t$amt", []>, OpSize32; + def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt), + "{l}ret{|f}q\t$amt", []>, Requires<[In64BitMode]>; + def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), + "{l}ret{w|f}\t$amt", []>, OpSize16; + + // The machine return from interrupt instruction, but sometimes we need to + // perform a post-epilogue stack adjustment. Codegen emits the pseudo form + // which expands to include an SP adjustment if necessary. + def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", []>, + OpSize16; + def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", []>, OpSize32; + def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", []>, Requires<[In64BitMode]>; + // let isCodeGenOnly = 1 in + // def IRET : PseudoI<(outs), (ins i32imm:$adj), [(X86iret timm:$adj)]>; + // def RET : PseudoI<(outs), (ins i32imm:$adj, variable_ops), [(X86retflag timm:$adj)]>; +} + +// Unconditional branches. +let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in { + def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst), + "jmp\t$dst", [(br bb:$dst)]>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst), + "jmp\t$dst", []>, OpSize16; + def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst), + "jmp\t$dst", []>, OpSize32; + } +} + +// Conditional Branches. 
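+// Each condition gets a short Jcc form (opcode opc1, rel8) that carries the
+// selection pattern, plus 16/32-bit-displacement forms (0x0F opc4) that exist
+// only for the assembler and disassembler; branches whose rel8 target ends up
+// out of range are relaxed to the wider encoding later.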
+let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in { + multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> { + def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, + [(X86brcond bb:$dst, Cond, EFLAGS)]>; + let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm, + []>, OpSize16, TB; + def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm, + []>, TB, OpSize32; + } + } +} + +defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>; +defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>; +defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>; +defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>; +defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>; +defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>; +defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>; +defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>; +defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>; +defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>; +defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>; +defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>; +defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>; +defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>; +defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>; +defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>; + +// jcx/jecx/jrcx instructions. +let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in { + // These are the 32-bit versions of this instruction for the asmparser. In + // 32-bit mode, the address size prefix is jcxz and the unprefixed version is + // jecxz. + let Uses = [CX] in + def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jcxz\t$dst", []>, AdSize16, Requires<[Not64BitMode]>; + let Uses = [ECX] in + def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jecxz\t$dst", []>, AdSize32; + + let Uses = [RCX] in + def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst), + "jrcxz\t$dst", []>, AdSize64, Requires<[In64BitMode]>; +} + +// Indirect branches +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst", + [(brind GR16:$dst)]>, Requires<[Not64BitMode]>, + OpSize16, Sched<[WriteJump]>; + def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst", + [(brind (loadi16 addr:$dst))]>, Requires<[Not64BitMode]>, + OpSize16, Sched<[WriteJumpLd]>; + + def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst", + [(brind GR32:$dst)]>, Requires<[Not64BitMode]>, + OpSize32, Sched<[WriteJump]>; + def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst", + [(brind (loadi32 addr:$dst))]>, Requires<[Not64BitMode]>, + OpSize32, Sched<[WriteJumpLd]>; + + def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst", + [(brind GR64:$dst)]>, Requires<[In64BitMode]>, + Sched<[WriteJump]>; + def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst", + [(brind (loadi64 addr:$dst))]>, Requires<[In64BitMode]>, + Sched<[WriteJumpLd]>; + + // Non-tracking jumps for IBT, use with caution. 
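+// The NOTRACK (0x3e) prefix marks an indirect branch as exempt from CET
+// indirect-branch tracking, so its target is not required to begin with an
+// ENDBR instruction.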
+ let isCodeGenOnly = 1 in { + def JMP16r_NT : I<0xFF, MRM4r, (outs), (ins GR16 : $dst), "jmp{w}\t{*}$dst", + [(X86NoTrackBrind GR16 : $dst)]>, Requires<[Not64BitMode]>, + OpSize16, Sched<[WriteJump]>, NOTRACK; + + def JMP16m_NT : I<0xFF, MRM4m, (outs), (ins i16mem : $dst), "jmp{w}\t{*}$dst", + [(X86NoTrackBrind (loadi16 addr : $dst))]>, + Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>, + NOTRACK; + + def JMP32r_NT : I<0xFF, MRM4r, (outs), (ins GR32 : $dst), "jmp{l}\t{*}$dst", + [(X86NoTrackBrind GR32 : $dst)]>, Requires<[Not64BitMode]>, + OpSize32, Sched<[WriteJump]>, NOTRACK; + def JMP32m_NT : I<0xFF, MRM4m, (outs), (ins i32mem : $dst), "jmp{l}\t{*}$dst", + [(X86NoTrackBrind (loadi32 addr : $dst))]>, + Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>, + NOTRACK; + + def JMP64r_NT : I<0xFF, MRM4r, (outs), (ins GR64 : $dst), "jmp{q}\t{*}$dst", + [(X86NoTrackBrind GR64 : $dst)]>, Requires<[In64BitMode]>, + Sched<[WriteJump]>, NOTRACK; + def JMP64m_NT : I<0xFF, MRM4m, (outs), (ins i64mem : $dst), "jmp{q}\t{*}$dst", + [(X86NoTrackBrind(loadi64 addr : $dst))]>, + Requires<[In64BitMode]>, Sched<[WriteJumpLd]>, NOTRACK; + } + + let Predicates = [Not64BitMode], AsmVariantName = "att" in { + def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "ljmp{w}\t$seg : $off", []>, + OpSize16, Sched<[WriteJump]>; + def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "ljmp{l}\t$seg : $off", []>, + OpSize32, Sched<[WriteJump]>; + } + def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaquemem:$dst), + "ljmp{q}\t{*}$dst", []>, Sched<[WriteJump]>, Requires<[In64BitMode]>; + + let AsmVariantName = "att" in + def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst), + "ljmp{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>; + def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst), + "{l}jmp{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>; +} + +// Loop instructions +let SchedRW = [WriteJump] in { +def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>; +def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>; +def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>; +} + +//===----------------------------------------------------------------------===// +// Call Instructions... +// +let isCall = 1 in + // All calls clobber the non-callee saved registers. ESP is marked as + // a use to prevent stack-pointer assignments that appear immediately + // before calls from potentially appearing dead. Uses for argument + // registers are added manually. 
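+  // SSP here is the CET shadow-stack pointer, which calls also implicitly use.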
+ let Uses = [ESP, SSP] in { + def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i32imm_pcrel:$dst), + "call{l}\t$dst", []>, OpSize32, + Requires<[Not64BitMode]>, Sched<[WriteJump]>; + let hasSideEffects = 0 in + def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm, + (outs), (ins i16imm_pcrel:$dst), + "call{w}\t$dst", []>, OpSize16, + Sched<[WriteJump]>; + def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst), + "call{w}\t{*}$dst", [(X86call GR16:$dst)]>, + OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>; + def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst), + "call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))]>, + OpSize16, Requires<[Not64BitMode,FavorMemIndirectCall]>, + Sched<[WriteJumpLd]>; + def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst), + "call{l}\t{*}$dst", [(X86call GR32:$dst)]>, OpSize32, + Requires<[Not64BitMode,NotUseRetpoline]>, Sched<[WriteJump]>; + def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst), + "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>, + OpSize32, + Requires<[Not64BitMode,FavorMemIndirectCall,NotUseRetpoline]>, + Sched<[WriteJumpLd]>; + + // Non-tracking calls for IBT, use with caution. + let isCodeGenOnly = 1 in { + def CALL16r_NT : I<0xFF, MRM2r, (outs), (ins GR16 : $dst), + "call{w}\t{*}$dst",[(X86NoTrackCall GR16 : $dst)]>, + OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>, NOTRACK; + def CALL16m_NT : I<0xFF, MRM2m, (outs), (ins i16mem : $dst), + "call{w}\t{*}$dst",[(X86NoTrackCall(loadi16 addr : $dst))]>, + OpSize16, Requires<[Not64BitMode,FavorMemIndirectCall]>, + Sched<[WriteJumpLd]>, NOTRACK; + def CALL32r_NT : I<0xFF, MRM2r, (outs), (ins GR32 : $dst), + "call{l}\t{*}$dst",[(X86NoTrackCall GR32 : $dst)]>, + OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>, NOTRACK; + def CALL32m_NT : I<0xFF, MRM2m, (outs), (ins i32mem : $dst), + "call{l}\t{*}$dst",[(X86NoTrackCall(loadi32 addr : $dst))]>, + OpSize32, Requires<[Not64BitMode,FavorMemIndirectCall]>, + Sched<[WriteJumpLd]>, NOTRACK; + } + + let Predicates = [Not64BitMode], AsmVariantName = "att" in { + def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs), + (ins i16imm:$off, i16imm:$seg), + "lcall{w}\t$seg, $off", []>, + OpSize16, Sched<[WriteJump]>; + def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs), + (ins i32imm:$off, i16imm:$seg), + "lcall{l}\t$seg, $off", []>, + OpSize32, Sched<[WriteJump]>; + } + + def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst), + "lcall{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>; + def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst), + "{l}call{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>; + } + + +/* +// Tail call stuff. +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + let Uses = [ESP, SSP] in { + def TCRETURNdi : PseudoI<(outs), + (ins i32imm_pcrel:$dst, i32imm:$offset), []>, NotMemoryFoldable; + def TCRETURNri : PseudoI<(outs), + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable; + let mayLoad = 1 in + def TCRETURNmi : PseudoI<(outs), + (ins i32mem_TC:$dst, i32imm:$offset), []>; + + // FIXME: The should be pseudo instructions that are lowered when going to + // mcinst. + def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs), + (ins i32imm_pcrel:$dst), "jmp\t$dst", []>; + + def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), + "", []>; // FIXME: Remove encoding when JIT is dead. 
+ let mayLoad = 1 in + def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), + "jmp{l}\t{*}$dst", []>; +} + +// Conditional tail calls are similar to the above, but they are branches +// rather than barriers, and they use EFLAGS. +let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, + isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + let Uses = [ESP, EFLAGS, SSP] in { + def TCRETURNdicc : PseudoI<(outs), + (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>; + + // This gets substituted to a conditional jump instruction in MC lowering. + def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs), + (ins i32imm_pcrel:$dst, i32imm:$cond), "", []>; +} +*/ + + +//===----------------------------------------------------------------------===// +// Call Instructions... +// + +// RSP is marked as a use to prevent stack-pointer assignments that appear +// immediately before calls from potentially appearing dead. Uses for argument +// registers are added manually. +let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { + // NOTE: this pattern doesn't match "X86call imm", because we do not know + // that the offset between an arbitrary immediate and the call will fit in + // the 32-bit pcrel field that we have. + def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, + (outs), (ins i64i32imm_pcrel:$dst), + "call{q}\t$dst", []>, OpSize32, + Requires<[In64BitMode]>; + def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst), + "call{q}\t{*}$dst", [(X86call GR64:$dst)]>, + Requires<[In64BitMode,NotUseRetpoline]>; + def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), + "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>, + Requires<[In64BitMode,FavorMemIndirectCall, + NotUseRetpoline]>; + + // Non-tracking calls for IBT, use with caution. + let isCodeGenOnly = 1 in { + def CALL64r_NT : I<0xFF, MRM2r, (outs), (ins GR64 : $dst), + "call{q}\t{*}$dst",[(X86NoTrackCall GR64 : $dst)]>, + Requires<[In64BitMode]>, NOTRACK; + def CALL64m_NT : I<0xFF, MRM2m, (outs), (ins i64mem : $dst), + "call{q}\t{*}$dst", + [(X86NoTrackCall(loadi64 addr : $dst))]>, + Requires<[In64BitMode,FavorMemIndirectCall]>, NOTRACK; + } + + def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaquemem:$dst), + "lcall{q}\t{*}$dst", []>; +} + +/* +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + isCodeGenOnly = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { + def TCRETURNdi64 : PseudoI<(outs), + (ins i64i32imm_pcrel:$dst, i32imm:$offset), + []>; + def TCRETURNri64 : PseudoI<(outs), + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable; + let mayLoad = 1 in + def TCRETURNmi64 : PseudoI<(outs), + (ins i64mem_TC:$dst, i32imm:$offset), []>, NotMemoryFoldable; + + def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst), + "jmp\t$dst", []>; + + def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), + "jmp{q}\t{*}$dst", []>; + + let mayLoad = 1 in + def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), + "jmp{q}\t{*}$dst", []>; + + // Win64 wants indirect jumps leaving the function to have a REX_W prefix. 
+ let hasREX_WPrefix = 1 in { + def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst), + "rex64 jmp{q}\t{*}$dst", []>; + + let mayLoad = 1 in + def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), + "rex64 jmp{q}\t{*}$dst", []>; + } +} + +let isPseudo = 1, isCall = 1, isCodeGenOnly = 1, + Uses = [RSP, SSP], + usesCustomInserter = 1, + SchedRW = [WriteJump] in { + def RETPOLINE_CALL32 : + PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>, + Requires<[Not64BitMode,UseRetpoline]>; + + def RETPOLINE_CALL64 : + PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>, + Requires<[In64BitMode,UseRetpoline]>; + + // Retpoline variant of indirect tail calls. + let isTerminator = 1, isReturn = 1, isBarrier = 1 in { + def RETPOLINE_TCRETURN64 : + PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>; + def RETPOLINE_TCRETURN32 : + PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>; + } +} + +// Conditional tail calls are similar to the above, but they are branches +// rather than barriers, and they use EFLAGS. +let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, + isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in + let Uses = [RSP, EFLAGS, SSP] in { + def TCRETURNdi64cc : PseudoI<(outs), + (ins i64i32imm_pcrel:$dst, i32imm:$offset, + i32imm:$cond), []>; + + // This gets substituted to a conditional jump instruction in MC lowering. + def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs), + (ins i64i32imm_pcrel:$dst, i32imm:$cond), "", []>; +} +*/ diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrExtension.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrExtension.td new file mode 100644 index 000000000..421792c55 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrExtension.td @@ -0,0 +1,204 @@ +//===-- X86InstrExtension.td - Sign and Zero Extensions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the sign and zero extension operations. 
+// +//===----------------------------------------------------------------------===// + +let hasSideEffects = 0 in { + let Defs = [AX], Uses = [AL] in // AX = signext(AL) + def CBW : I<0x98, RawFrm, (outs), (ins), + "{cbtw|cbw}", []>, OpSize16, Sched<[WriteALU]>; + let Defs = [EAX], Uses = [AX] in // EAX = signext(AX) + def CWDE : I<0x98, RawFrm, (outs), (ins), + "{cwtl|cwde}", []>, OpSize32, Sched<[WriteALU]>; + + let Defs = [AX,DX], Uses = [AX] in // DX:AX = signext(AX) + def CWD : I<0x99, RawFrm, (outs), (ins), + "{cwtd|cwd}", []>, OpSize16, Sched<[WriteALU]>; + let Defs = [EAX,EDX], Uses = [EAX] in // EDX:EAX = signext(EAX) + def CDQ : I<0x99, RawFrm, (outs), (ins), + "{cltd|cdq}", []>, OpSize32, Sched<[WriteALU]>; + + + let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX) + def CDQE : RI<0x98, RawFrm, (outs), (ins), + "{cltq|cdqe}", []>, Sched<[WriteALU]>; + + let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX) + def CQO : RI<0x99, RawFrm, (outs), (ins), + "{cqto|cqo}", []>, Sched<[WriteALU]>; +} + +// Sign/Zero extenders +let hasSideEffects = 0 in { +def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, + TB, OpSize16, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, + TB, OpSize16, Sched<[WriteALULd]>; +} // hasSideEffects = 0 +def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sext GR8:$src))]>, TB, + OpSize32, Sched<[WriteALU]>; +def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), + "movs{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB, + OpSize32, Sched<[WriteALULd]>; +def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src), + "movs{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sext GR16:$src))]>, TB, + OpSize32, Sched<[WriteALU]>; +def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "movs{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, + OpSize32, TB, Sched<[WriteALULd]>; + +let hasSideEffects = 0 in { +def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, + TB, OpSize16, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src), + "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, + TB, OpSize16, Sched<[WriteALULd]>; +} // hasSideEffects = 0 +def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zext GR8:$src))]>, TB, + OpSize32, Sched<[WriteALU]>; +def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB, + OpSize32, Sched<[WriteALULd]>; +def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src), + "movz{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zext GR16:$src))]>, TB, + OpSize32, Sched<[WriteALU]>; +def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "movz{wl|x}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (zextloadi32i16 addr:$src))]>, + TB, OpSize32, Sched<[WriteALULd]>; + +// These instructions exist as a consequence of operand size prefix having +// control of the destination size, but not the input size. 
Only support them +// for the disassembler. +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def MOVSX16rr16: I<0xBF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "movs{ww|x}\t{$src, $dst|$dst, $src}", + []>, TB, OpSize16, Sched<[WriteALU]>, NotMemoryFoldable; +def MOVZX16rr16: I<0xB7, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "movz{ww|x}\t{$src, $dst|$dst, $src}", + []>, TB, OpSize16, Sched<[WriteALU]>, NotMemoryFoldable; +let mayLoad = 1 in { +def MOVSX16rm16: I<0xBF, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movs{ww|x}\t{$src, $dst|$dst, $src}", + []>, OpSize16, TB, Sched<[WriteALULd]>, NotMemoryFoldable; +def MOVZX16rm16: I<0xB7, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movz{ww|x}\t{$src, $dst|$dst, $src}", + []>, TB, OpSize16, Sched<[WriteALULd]>, NotMemoryFoldable; +} // mayLoad = 1 +} // isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 + +// These are the same as the regular MOVZX32rr8 and MOVZX32rm8 +// except that they use GR32_NOREX for the output operand register class +// instead of GR32. This allows them to operate on h registers on x86-64. +let hasSideEffects = 0, isCodeGenOnly = 1 in { +def MOVZX32rr8_NOREX : I<0xB6, MRMSrcReg, + (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + []>, TB, OpSize32, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX32rm8_NOREX : I<0xB6, MRMSrcMem, + (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), + "movz{bl|x}\t{$src, $dst|$dst, $src}", + []>, TB, OpSize32, Sched<[WriteALULd]>; + +def MOVSX32rr8_NOREX : I<0xBE, MRMSrcReg, + (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src}", + []>, TB, OpSize32, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVSX32rm8_NOREX : I<0xBE, MRMSrcMem, + (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src), + "movs{bl|x}\t{$src, $dst|$dst, $src}", + []>, TB, OpSize32, Sched<[WriteALULd]>; +} + +// MOVSX64rr8 always has a REX prefix and it has an 8-bit register +// operand, which makes it a rare instruction with an 8-bit register +// operand that can never access an h register. If support for h registers +// were generalized, this would require a special register class. 
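+// (With a REX prefix present, the encodings that would otherwise name
+// AH/CH/DH/BH select SPL/BPL/SIL/DIL instead, which is why the h registers are
+// unreachable here.)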
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src), + "movs{bq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR8:$src))]>, TB, + Sched<[WriteALU]>; +def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src), + "movs{bq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i8 addr:$src))]>, + TB, Sched<[WriteALULd]>; +def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "movs{wq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR16:$src))]>, TB, + Sched<[WriteALU]>; +def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "movs{wq|x}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i16 addr:$src))]>, + TB, Sched<[WriteALULd]>; +def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), + "movs{lq|xd}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sext GR32:$src))]>, + Sched<[WriteALU]>, Requires<[In64BitMode]>; +def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src), + "movs{lq|xd}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (sextloadi64i32 addr:$src))]>, + Sched<[WriteALULd]>, Requires<[In64BitMode]>; + +// movzbq and movzwq encodings for the disassembler +let hasSideEffects = 0 in { +def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", []>, + TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src), + "movz{bq|x}\t{$src, $dst|$dst, $src}", []>, + TB, Sched<[WriteALULd]>; +def MOVZX64rr16 : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", []>, + TB, Sched<[WriteALU]>; +let mayLoad = 1 in +def MOVZX64rm16 : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "movz{wq|x}\t{$src, $dst|$dst, $src}", []>, + TB, Sched<[WriteALULd]>; +} + +// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a +// 32-bit register. +def : Pat<(i64 (zext GR8:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>; +def : Pat<(zextloadi64i8 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; + +def : Pat<(i64 (zext GR16:$src)), + (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>; +def : Pat<(zextloadi64i16 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; + +// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a +// SUBREG_TO_REG to utilize implicit zero-extension, however this isn't possible +// when the 32-bit value is defined by a truncate or is copied from something +// where the high bits aren't necessarily all zero. In such cases, we fall back +// to these explicit zext instructions. +def : Pat<(i64 (zext GR32:$src)), + (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>; +def : Pat<(i64 (zextloadi64i32 addr:$src)), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrFMA.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrFMA.td new file mode 100644 index 000000000..a559f62c8 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrFMA.td @@ -0,0 +1,636 @@ +//===-- X86InstrFMA.td - FMA Instruction Set ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file describes FMA (Fused Multiply-Add) instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FMA3 - Intel 3 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + +// For all FMA opcodes declared in fma3p_rm_* and fma3s_rm_* multiclasses +// defined below, both the register and memory variants are commutable. +// For the register form the commutable operands are 1, 2 and 3. +// For the memory variant the folded operand must be in 3. Thus, +// in that case, only the operands 1 and 2 can be swapped. +// Commuting some of operands may require the opcode change. +// FMA*213*: +// operands 1 and 2 (memory & register forms): *213* --> *213*(no changes); +// operands 1 and 3 (register forms only): *213* --> *231*; +// operands 2 and 3 (register forms only): *213* --> *132*. +// FMA*132*: +// operands 1 and 2 (memory & register forms): *132* --> *231*; +// operands 1 and 3 (register forms only): *132* --> *132*(no changes); +// operands 2 and 3 (register forms only): *132* --> *213*. +// FMA*231*: +// operands 1 and 2 (memory & register forms): *231* --> *132*; +// operands 1 and 3 (register forms only): *231* --> *213*; +// operands 2 and 3 (register forms only): *231* --> *231*(no changes). + +multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType VT, X86MemOperand x86memop, PatFrag MemFrag, + SDNode Op, X86FoldableSchedWrite sched> { + def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>, + Sched<[sched]>; + + let mayLoad = 1 in + def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, + (MemFrag addr:$src3))))]>, + Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; +} + +multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType VT, X86MemOperand x86memop, PatFrag MemFrag, + SDNode Op, X86FoldableSchedWrite sched> { + let hasSideEffects = 0 in + def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>, Sched<[sched]>; + + let mayLoad = 1 in + def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, (VT (Op RC:$src2, (MemFrag addr:$src3), + RC:$src1)))]>, + Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; +} + +multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType VT, X86MemOperand x86memop, PatFrag MemFrag, + SDNode Op, X86FoldableSchedWrite sched> { + let hasSideEffects = 0 in + def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>, Sched<[sched]>; + + // Pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
+ let mayLoad = 1 in + def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, (VT (Op (MemFrag addr:$src3), RC:$src1, + RC:$src2)))]>, + Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; +} + +let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in +multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpcodeStr, string PackTy, string Suff, + PatFrag MemFrag128, PatFrag MemFrag256, + SDNode Op, ValueType OpTy128, ValueType OpTy256, + X86SchedWriteWidths sched> { + defm NAME#213#Suff : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy), + VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>; + defm NAME#231#Suff : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy), + VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>; + defm NAME#132#Suff : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy), + VR128, OpTy128, f128mem, MemFrag128, Op, sched.XMM>; + + defm NAME#213#Suff#Y : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy), + VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>, + VEX_L; + defm NAME#231#Suff#Y : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy), + VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>, + VEX_L; + defm NAME#132#Suff#Y : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy), + VR256, OpTy256, f256mem, MemFrag256, Op, sched.YMM>, + VEX_L; +} + +// Fused Multiply-Add +let ExeDomain = SSEPackedSingle in { + defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS", + loadv4f32, loadv8f32, X86Fmadd, v4f32, v8f32, + SchedWriteFMA>; + defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS", + loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32, + SchedWriteFMA>; + defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS", + loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32, + SchedWriteFMA>; + defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS", + loadv4f32, loadv8f32, X86Fmsubadd, v4f32, v8f32, + SchedWriteFMA>; +} + +let ExeDomain = SSEPackedDouble in { + defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD", + loadv2f64, loadv4f64, X86Fmadd, v2f64, + v4f64, SchedWriteFMA>, VEX_W; + defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD", + loadv2f64, loadv4f64, X86Fmsub, v2f64, + v4f64, SchedWriteFMA>, VEX_W; + defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD", + loadv2f64, loadv4f64, X86Fmaddsub, + v2f64, v4f64, SchedWriteFMA>, VEX_W; + defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", "PD", + loadv2f64, loadv4f64, X86Fmsubadd, + v2f64, v4f64, SchedWriteFMA>, VEX_W; +} + +// Fused Negative Multiply-Add +let ExeDomain = SSEPackedSingle in { + defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32, + loadv8f32, X86Fnmadd, v4f32, v8f32, SchedWriteFMA>; + defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32, + loadv8f32, X86Fnmsub, v4f32, v8f32, SchedWriteFMA>; +} +let ExeDomain = SSEPackedDouble in { + defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64, + loadv4f64, X86Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W; + defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64, + loadv4f64, X86Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W; +} + +// All source register operands of FMA opcodes defined in fma3s_rm multiclass +// can be commuted. 
In many cases such commute transformation requres an opcode +// adjustment, for example, commuting the operands 1 and 2 in FMA*132 form +// would require an opcode change to FMA*231: +// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2; +// --> +// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2; +// Please see more detailed comment at the very beginning of the section +// defining FMA3 opcodes above. +multiclass fma3s_rm_213<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + SDPatternOperator OpNode, + X86FoldableSchedWrite sched> { + def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>, + Sched<[sched]>; + + let mayLoad = 1 in + def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>, + Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; +} + +multiclass fma3s_rm_231<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + SDPatternOperator OpNode, X86FoldableSchedWrite sched> { + let hasSideEffects = 0 in + def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>, Sched<[sched]>; + + let mayLoad = 1 in + def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpNode RC:$src2, (load addr:$src3), RC:$src1))]>, + Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; +} + +multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + SDPatternOperator OpNode, X86FoldableSchedWrite sched> { + let hasSideEffects = 0 in + def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>, Sched<[sched]>; + + // Pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. + let mayLoad = 1 in + def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpNode (load addr:$src3), RC:$src1, RC:$src2))]>, + Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; +} + +let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in +multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpStr, string PackTy, string Suff, + SDNode OpNode, RegisterClass RC, + X86MemOperand x86memop, X86FoldableSchedWrite sched> { + defm NAME#213#Suff : fma3s_rm_213<opc213, !strconcat(OpStr, "213", PackTy), + x86memop, RC, OpNode, sched>; + defm NAME#231#Suff : fma3s_rm_231<opc231, !strconcat(OpStr, "231", PackTy), + x86memop, RC, OpNode, sched>; + defm NAME#132#Suff : fma3s_rm_132<opc132, !strconcat(OpStr, "132", PackTy), + x86memop, RC, OpNode, sched>; +} + +// These FMA*_Int instructions are defined specially for being used when +// the scalar FMA intrinsics are lowered to machine instructions, and in that +// sense, they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc. +// instructions. 
+//
+// All of the FMA*_Int opcodes are defined as commutable here.
+// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial
+// and the corresponding optimizations have been developed.
+// Commuting the 1st operand of FMA*_Int requires some additional analysis;
+// the commute optimization is legal only if all users of FMA*_Int use only
+// the lowest element of the FMA*_Int instruction. Even though such analysis
+// may not be implemented yet, we allow the routines doing the actual commute
+// transformation to decide whether a particular instruction is commutable.
+let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
+    hasSideEffects = 0 in
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
+                        Operand memopr, RegisterClass RC,
+                        X86FoldableSchedWrite sched> {
+  def r_Int : FMA3S_Int<opc, MRMSrcReg, (outs RC:$dst),
+                        (ins RC:$src1, RC:$src2, RC:$src3),
+                        !strconcat(OpcodeStr,
+                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                        []>, Sched<[sched]>;
+
+  let mayLoad = 1 in
+  def m_Int : FMA3S_Int<opc, MRMSrcMem, (outs RC:$dst),
+                        (ins RC:$src1, RC:$src2, memopr:$src3),
+                        !strconcat(OpcodeStr,
+                                   "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                        []>, Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>;
+}
+
+// The FMA 213 form is created for lowering of scalar FMA intrinsics
+// to machine instructions.
+// The FMA 132 form can trivially be obtained by commuting the 2nd and 3rd
+// operands of the FMA 213 form.
+// The FMA 231 form can be obtained only by commuting the 1st operand of the
+// 213 or 132 forms, which is possible only after a special analysis of all
+// uses of the initial instruction. Such an analysis does not exist yet, so the
+// 231 form of the FMA*_Int instructions is introduced under the optimistic
+// assumption that such an analysis will be implemented eventually.
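+// As an illustration of how the *_Int record names used by the patterns
+// further down (e.g. "VFMADD213SSr_Int") are formed, the defm/NAME pasting
+// in the multiclasses of this file behaves like the following stripped-down
+// sketch, which uses only generic TableGen (no X86 classes) and can be fed
+// to llvm-tblgen on its own; "Inst" is just a stand-in record class:
+//
+//   class Inst<string asm> { string AsmString = asm; }
+//   multiclass rm_int<string opcodeStr> {
+//     def r_Int : Inst<opcodeStr>;
+//     def m_Int : Inst<!strconcat(opcodeStr, " (mem)")>;
+//   }
+//   multiclass int_forms<string OpStr, string PackTy, string Suff> {
+//     defm NAME#132#Suff : rm_int<!strconcat(OpStr, "132", PackTy)>;
+//     defm NAME#213#Suff : rm_int<!strconcat(OpStr, "213", PackTy)>;
+//     defm NAME#231#Suff : rm_int<!strconcat(OpStr, "231", PackTy)>;
+//   }
+//   defm VFMADD : int_forms<"vfmadd", "ss", "SS">;
+//
+// This produces records named VFMADD132SSr_Int, VFMADD132SSm_Int,
+// VFMADD213SSr_Int, VFMADD213SSm_Int, VFMADD231SSr_Int and VFMADD231SSm_Int,
+// matching the names that scalar_fma_patterns below reconstructs with
+// !cast<Instruction>(Prefix#"213"#Suffix#"r_Int").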
+multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpStr, string PackTy, string Suff, + RegisterClass RC, Operand memop, + X86FoldableSchedWrite sched> { + defm NAME#132#Suff : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy), + memop, RC, sched>; + defm NAME#213#Suff : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy), + memop, RC, sched>; + defm NAME#231#Suff : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy), + memop, RC, sched>; +} + +multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpStr, SDNode OpNode, X86FoldableSchedWrite sched> { + let ExeDomain = SSEPackedSingle in + defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode, + FR32, f32mem, sched>, + fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", "SS", + VR128, ssmem, sched>; + + let ExeDomain = SSEPackedDouble in + defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "SD", OpNode, + FR64, f64mem, sched>, + fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", "SD", + VR128, sdmem, sched>, VEX_W; +} + +defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadd, + SchedWriteFMA.Scl>, VEX_LIG; +defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsub, + SchedWriteFMA.Scl>, VEX_LIG; + +defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadd, + SchedWriteFMA.Scl>, VEX_LIG; +defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsub, + SchedWriteFMA.Scl>, VEX_LIG; + +multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix, + SDNode Move, ValueType VT, ValueType EltVT, + RegisterClass RC, PatFrag mem_frag> { + let Predicates = [HasFMA, NoAVX512] in { + def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector + (Op RC:$src2, + (EltVT (extractelt (VT VR128:$src1), (iPTR 0))), + RC:$src3))))), + (!cast<Instruction>(Prefix#"213"#Suffix#"r_Int") + VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)), + (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>; + + def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector + (Op RC:$src2, RC:$src3, + (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))), + (!cast<Instruction>(Prefix#"231"#Suffix#"r_Int") + VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)), + (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>; + + def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector + (Op RC:$src2, + (EltVT (extractelt (VT VR128:$src1), (iPTR 0))), + (mem_frag addr:$src3)))))), + (!cast<Instruction>(Prefix#"213"#Suffix#"m_Int") + VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)), + addr:$src3)>; + + def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector + (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))), + (mem_frag addr:$src3), RC:$src2))))), + (!cast<Instruction>(Prefix#"132"#Suffix#"m_Int") + VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)), + addr:$src3)>; + + def : Pat<(VT (Move (VT VR128:$src1), (VT (scalar_to_vector + (Op RC:$src2, (mem_frag addr:$src3), + (EltVT (extractelt (VT VR128:$src1), (iPTR 0)))))))), + (!cast<Instruction>(Prefix#"231"#Suffix#"m_Int") + VR128:$src1, (VT (COPY_TO_REGCLASS RC:$src2, VR128)), + addr:$src3)>; + } +} + +defm : scalar_fma_patterns<X86Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>; +defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>; +defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>; +defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>; + +defm : scalar_fma_patterns<X86Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, 
FR64, loadf64>; +defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; +defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; +defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>; + +//===----------------------------------------------------------------------===// +// FMA4 - AMD 4 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + +multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType OpVT, SDNode OpNode, + PatFrag mem_frag, X86FoldableSchedWrite sched> { + let isCommutable = 1 in + def rr : FMA4S<opc, MRMSrcRegOp4, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, + Sched<[sched]>; + def rm : FMA4S<opc, MRMSrcMemOp4, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2, + (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, + Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; + def mr : FMA4S<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG, + Sched<[sched.Folded, ReadAfterLd, + // x86memop:$src2 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src3 + ReadAfterLd]>; +// For disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : FMA4S<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + VEX_LIG, FoldGenData<NAME#rr>, Sched<[sched]>; +} + +multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, + ValueType VT, X86FoldableSchedWrite sched> { +let isCodeGenOnly = 1, hasSideEffects = 0 in { + def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_W, VEX_LIG, Sched<[sched]>; + let mayLoad = 1 in + def rm_Int : FMA4S_Int<opc, MRMSrcMemOp4, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_W, VEX_LIG, + Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; + let mayLoad = 1 in + def mr_Int : FMA4S_Int<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, memop:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, + VEX_LIG, Sched<[sched.Folded, ReadAfterLd, + // memop:$src2 + ReadDefault, ReadDefault, ReadDefault, + ReadDefault, ReadDefault, + // VR128::$src3 + ReadAfterLd]>; + def rr_Int_REV : FMA4S_Int<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, VEX_LIG, FoldGenData<NAME#rr_Int>, Sched<[sched]>; +} // isCodeGenOnly = 1 +} + +multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT128, ValueType OpVT256, + PatFrag ld_frag128, 
PatFrag ld_frag256, + X86SchedWriteWidths sched> { + let isCommutable = 1 in + def rr : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, + VEX_W, Sched<[sched.XMM]>; + def rm : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2, + (ld_frag128 addr:$src3)))]>, VEX_W, + Sched<[sched.XMM.Folded, ReadAfterLd, ReadAfterLd]>; + def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>, + Sched<[sched.XMM.Folded, ReadAfterLd, + // f128mem:$src2 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // VR128::$src3 + ReadAfterLd]>; + let isCommutable = 1 in + def Yrr : FMA4<opc, MRMSrcRegOp4, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, + (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>, + VEX_W, VEX_L, Sched<[sched.YMM]>; + def Yrm : FMA4<opc, MRMSrcMemOp4, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2, + (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L, + Sched<[sched.YMM.Folded, ReadAfterLd, ReadAfterLd]>; + def Ymr : FMA4<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR256:$dst, (OpNode VR256:$src1, + (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L, + Sched<[sched.YMM.Folded, ReadAfterLd, + // f256mem:$src2 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // VR256::$src3 + ReadAfterLd]>; +// For disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { + def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + Sched<[sched.XMM]>, FoldGenData<NAME#rr>; + def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + VEX_L, Sched<[sched.YMM]>, FoldGenData<NAME#Yrr>; +} // isCodeGenOnly = 1 +} + +let ExeDomain = SSEPackedSingle in { + // Scalar Instructions + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32, + SchedWriteFMA.Scl>, + fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, + SchedWriteFMA.Scl>; + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32, + SchedWriteFMA.Scl>, + fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, + SchedWriteFMA.Scl>; + defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32, SchedWriteFMA.Scl>, + fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, + SchedWriteFMA.Scl>; + defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32, SchedWriteFMA.Scl>, + 
fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, + SchedWriteFMA.Scl>; + // Packed Instructions + defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, + loadv4f32, loadv8f32, SchedWriteFMA>; + defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, + loadv4f32, loadv8f32, SchedWriteFMA>; + defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32, + loadv4f32, loadv8f32, SchedWriteFMA>; + defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32, + loadv4f32, loadv8f32, SchedWriteFMA>; + defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32, + loadv4f32, loadv8f32, SchedWriteFMA>; + defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32, + loadv4f32, loadv8f32, SchedWriteFMA>; +} + +let ExeDomain = SSEPackedDouble in { + // Scalar Instructions + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64, + SchedWriteFMA.Scl>, + fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, + SchedWriteFMA.Scl>; + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64, + SchedWriteFMA.Scl>, + fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, + SchedWriteFMA.Scl>; + defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64, SchedWriteFMA.Scl>, + fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, + SchedWriteFMA.Scl>; + defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64, SchedWriteFMA.Scl>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, + SchedWriteFMA.Scl>; + // Packed Instructions + defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, + loadv2f64, loadv4f64, SchedWriteFMA>; + defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, + loadv2f64, loadv4f64, SchedWriteFMA>; + defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64, + loadv2f64, loadv4f64, SchedWriteFMA>; + defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64, + loadv2f64, loadv4f64, SchedWriteFMA>; + defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64, + loadv2f64, loadv4f64, SchedWriteFMA>; + defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64, + loadv2f64, loadv4f64, SchedWriteFMA>; +} + +multiclass scalar_fma4_patterns<SDNode Op, string Name, + ValueType VT, ValueType EltVT, + RegisterClass RC, PatFrag mem_frag> { + let Predicates = [HasFMA4] in { + def : Pat<(VT (X86vzmovl (VT (scalar_to_vector + (Op RC:$src1, RC:$src2, RC:$src3))))), + (!cast<Instruction>(Name#"rr_Int") + (VT (COPY_TO_REGCLASS RC:$src1, VR128)), + (VT (COPY_TO_REGCLASS RC:$src2, VR128)), + (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>; + + def : Pat<(VT (X86vzmovl (VT (scalar_to_vector + (Op RC:$src1, RC:$src2, + (mem_frag addr:$src3)))))), + (!cast<Instruction>(Name#"rm_Int") + (VT (COPY_TO_REGCLASS RC:$src1, VR128)), + (VT (COPY_TO_REGCLASS RC:$src2, VR128)), addr:$src3)>; + + def : Pat<(VT (X86vzmovl (VT (scalar_to_vector + (Op RC:$src1, (mem_frag addr:$src2), + RC:$src3))))), + (!cast<Instruction>(Name#"mr_Int") + (VT (COPY_TO_REGCLASS RC:$src1, VR128)), addr:$src2, + (VT (COPY_TO_REGCLASS RC:$src3, VR128)))>; + } +} + +defm : scalar_fma4_patterns<X86Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>; +defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>; +defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>; +defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>; + +defm : scalar_fma4_patterns<X86Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>; +defm : 
scalar_fma4_patterns<X86Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>; +defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>; +defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrFPStack.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrFPStack.td new file mode 100644 index 000000000..b0c9bd163 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrFPStack.td @@ -0,0 +1,748 @@ +//===- X86InstrFPStack.td - FPU Instruction Set ------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 x87 FPU instruction set, defining the +// instructions, and properties of the instructions which are needed for code +// generation, machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// FPStack specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDTX86FpGet2 : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, + SDTCisVT<1, f80>]>; +def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>, + SDTCisVT<2, OtherVT>]>; +def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; +def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; + +def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; + +def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86fst : SDNode<"X86ISD::FST", SDTX86Fst, + [SDNPHasChain, SDNPInGlue, SDNPMayStore, + SDNPMemOperand]>; +def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild, + [SDNPHasChain, SDNPOutGlue, SDNPMayLoad, + SDNPMemOperand]>; +def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>; +def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore, + [SDNPHasChain, SDNPMayStore, SDNPSideEffect, + SDNPMemOperand]>; + +//===----------------------------------------------------------------------===// +// FPStack pattern fragments +//===----------------------------------------------------------------------===// + +def fpimm0 : FPImmLeaf<fAny, [{ + return Imm.isExactlyValue(+0.0); +}]>; + +def fpimmneg0 : FPImmLeaf<fAny, [{ + return Imm.isExactlyValue(-0.0); +}]>; + +def fpimm1 : FPImmLeaf<fAny, [{ + return Imm.isExactlyValue(+1.0); +}]>; + +def fpimmneg1 : FPImmLeaf<fAny, [{ + return Imm.isExactlyValue(-1.0); +}]>; + +/* +// Some 'special' instructions - expanded after instruction selection. 
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { + def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src), + [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>; + def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src), + [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>; + def FP32_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP32:$src), + [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>; + def FP64_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP64:$src), + [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>; + def FP64_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP64:$src), + [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>; + def FP64_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP64:$src), + [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>; + def FP80_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP80:$src), + [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>; + def FP80_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP80:$src), + [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>; + def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src), + [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>; +} +*/ + +// All FP Stack operations are represented with four instructions here. The +// first three instructions, generated by the instruction selector, use "RFP32" +// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit, +// 64-bit or 80-bit floating point values. These sizes apply to the values, +// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be +// copied to each other without losing information. These instructions are all +// pseudo instructions and use the "_Fp" suffix. +// In some cases there are additional variants with a mixture of different +// register sizes. +// The second instruction is defined with FPI, which is the actual instruction +// emitted by the assembler. These use "RST" registers, although frequently +// the actual register(s) used are implicit. These are always 80 bits. +// The FP stackifier pass converts one to the other after register allocation +// occurs. +// +// Note that the FpI instruction should have instruction selection info (e.g. +// a pattern) and the FPI instruction should have emission info (e.g. opcode +// encoding and asm printing info). + +// FpIf32, FpIf64 - Floating Point Pseudo Instruction template. +// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1. +// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2. +// f80 instructions cannot use SSE and use neither of these. +class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>; +class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>; + +// Factoring for arithmetic. +multiclass FPBinary_rr<SDNode OpNode> { +// Register op register -> register +// These are separated out because they have no reversed form. +def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP, + [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>; +def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP, + [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>; +def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, + [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>; +} +// The FopST0 series are not included here because of the irregularities +// in where the 'r' goes in assembly output. 
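+
+// For illustration: the instantiation "defm ADD : FPBinary_rr<fadd>" further
+// down expands, by ordinary defm prefixing, into the three pseudos ADD_Fp32,
+// ADD_Fp64 and ADD_Fp80; e.g. the 32-bit one is equivalent to writing
+//
+//   def ADD_Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+//                         TwoArgFP,
+//                         [(set RFP32:$dst,
+//                               (fadd RFP32:$src1, RFP32:$src2))]>;
+//
+// Only these _Fp* pseudos carry selection patterns; the FPI definitions below
+// (ADD_F32m, ADD_FST0r, etc.) provide the encodings that the FP stackifier
+// rewrites the pseudos into after register allocation, as described above.
+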
+// These instructions cannot address 80-bit memory. +multiclass FPBinary<SDNode OpNode, Format fp, string asmstring, + bit Forward = 1> { +let mayLoad = 1, hasSideEffects = 1 in { +// ST(0) = ST(0) + [mem] +def _Fp32m : FpIf32<(outs RFP32:$dst), + (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (loadf32 addr:$src2))), + (set RFP32:$dst, + (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>; +def _Fp64m : FpIf64<(outs RFP64:$dst), + (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (loadf64 addr:$src2))), + (set RFP64:$dst, + (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>; +def _Fp64m32: FpIf64<(outs RFP64:$dst), + (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))), + (set RFP64:$dst, + (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>; +def _Fp80m32: FpI_<(outs RFP80:$dst), + (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>; +def _Fp80m64: FpI_<(outs RFP80:$dst), + (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; +def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), + !strconcat("f", asmstring, "{s}\t$src")>; +def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), + !strconcat("f", asmstring, "{l}\t$src")>; +// ST(0) = ST(0) + [memint] +def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; +def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; +def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; +def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; +def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; +def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; +def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), + !strconcat("fi", asmstring, "{s}\t$src")>; +def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), + !strconcat("fi", asmstring, "{l}\t$src")>; +} // mayLoad = 1, hasSideEffects = 1 +} + +let Defs = [FPSW] in { +// FPBinary_rr just defines pseudo-instructions, no need to set a scheduling +// resources. 
+let hasNoSchedulingInfo = 1 in { +defm ADD : FPBinary_rr<fadd>; +defm SUB : FPBinary_rr<fsub>; +defm MUL : FPBinary_rr<fmul>; +defm DIV : FPBinary_rr<fdiv>; +} + +// Sets the scheduling resources for the actual NAME#_F<size>m defintions. +let SchedRW = [WriteFAddLd] in { +defm ADD : FPBinary<fadd, MRM0m, "add">; +defm SUB : FPBinary<fsub, MRM4m, "sub">; +defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>; +} + +let SchedRW = [WriteFMulLd] in { +defm MUL : FPBinary<fmul, MRM1m, "mul">; +} + +let SchedRW = [WriteFDivLd] in { +defm DIV : FPBinary<fdiv, MRM6m, "div">; +defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>; +} +} // Defs = [FPSW] + +class FPST0rInst<Format fp, string asm> + : FPI<0xD8, fp, (outs), (ins RST:$op), asm>; +class FPrST0Inst<Format fp, string asm> + : FPI<0xDC, fp, (outs), (ins RST:$op), asm>; +class FPrST0PInst<Format fp, string asm> + : FPI<0xDE, fp, (outs), (ins RST:$op), asm>; + +// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion +// of some of the 'reverse' forms of the fsub and fdiv instructions. As such, +// we have to put some 'r's in and take them out of weird places. +let SchedRW = [WriteFAdd] in { +def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t$op">; +def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st(0), $op|$op, st(0)}">; +def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t$op">; +def SUBR_FST0r : FPST0rInst <MRM5r, "fsubr\t$op">; +def SUB_FrST0 : FPrST0Inst <MRM5r, "fsub{r}\t{%st(0), $op|$op, st(0)}">; +def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t$op">; +def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t$op">; +def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">; +def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">; +} // SchedRW +let SchedRW = [WriteFCom] in { +def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">; +def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">; +} // SchedRW +let SchedRW = [WriteFMul] in { +def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t$op">; +def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st(0), $op|$op, st(0)}">; +def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t$op">; +} // SchedRW +let SchedRW = [WriteFDiv] in { +def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t$op">; +def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st(0), $op|$op, st(0)}">; +def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t$op">; +def DIV_FST0r : FPST0rInst <MRM6r, "fdiv\t$op">; +def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">; +def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">; +} // SchedRW + +// Unary operations. 
+multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> { +def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW, + [(set RFP32:$dst, (OpNode RFP32:$src))]>; +def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW, + [(set RFP64:$dst, (OpNode RFP64:$src))]>; +def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW, + [(set RFP80:$dst, (OpNode RFP80:$src))]>; +def _F : FPI<0xD9, fp, (outs), (ins), asmstring>; +} + +let Defs = [FPSW] in { + +let SchedRW = [WriteFSign] in { +defm CHS : FPUnary<fneg, MRM_E0, "fchs">; +defm ABS : FPUnary<fabs, MRM_E1, "fabs">; +} + +let SchedRW = [WriteFSqrt80] in +defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">; + +let SchedRW = [WriteMicrocoded] in { +defm SIN : FPUnary<fsin, MRM_FE, "fsin">; +defm COS : FPUnary<fcos, MRM_FF, "fcos">; +} + +let SchedRW = [WriteFCom] in { +let hasSideEffects = 0 in { +def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>; +def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>; +def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>; +} // hasSideEffects + +def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">; +} // SchedRW +} // Defs = [FPSW] + +// Versions of FP instructions that take a single memory operand. Added for the +// disassembler; remove as they are included with patterns elsewhere. +let SchedRW = [WriteFComLd] in { +def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">; +def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">; + +def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">; +def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; + +def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; +def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; + +def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">; +def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">; +} // SchedRW + +let SchedRW = [WriteMicrocoded] in { +def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">; +def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">; + +def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">; +def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">; +def FNSTSWm : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">; + +def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\ttbyte ptr $src">; +def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\ttbyte ptr $dst">; +} // SchedRW + +// Floating point cmovs. 
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>; +class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : + FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>; + +multiclass FPCMov<PatLeaf cc> { + def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), + CondMovFP, + [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2, + cc, EFLAGS))]>; + def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), + CondMovFP, + [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2, + cc, EFLAGS))]>; + def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), + CondMovFP, + [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2, + cc, EFLAGS))]>, + Requires<[HasCMov]>; +} + +let Defs = [FPSW] in { +let SchedRW = [WriteFCMOV] in { +let Uses = [EFLAGS], Constraints = "$src1 = $dst" in { +defm CMOVB : FPCMov<X86_COND_B>; +defm CMOVBE : FPCMov<X86_COND_BE>; +defm CMOVE : FPCMov<X86_COND_E>; +defm CMOVP : FPCMov<X86_COND_P>; +defm CMOVNB : FPCMov<X86_COND_AE>; +defm CMOVNBE: FPCMov<X86_COND_A>; +defm CMOVNE : FPCMov<X86_COND_NE>; +defm CMOVNP : FPCMov<X86_COND_NP>; +} // Uses = [EFLAGS], Constraints = "$src1 = $dst" + +let Predicates = [HasCMov] in { +// These are not factored because there's no clean way to pass DA/DB. +def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op), + "fcmovb\t{$op, %st(0)|st(0), $op}">; +def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op), + "fcmovbe\t{$op, %st(0)|st(0), $op}">; +def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op), + "fcmove\t{$op, %st(0)|st(0), $op}">; +def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op), + "fcmovu\t{$op, %st(0)|st(0), $op}">; +def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op), + "fcmovnb\t{$op, %st(0)|st(0), $op}">; +def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op), + "fcmovnbe\t{$op, %st(0)|st(0), $op}">; +def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op), + "fcmovne\t{$op, %st(0)|st(0), $op}">; +def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op), + "fcmovnu\t{$op, %st(0)|st(0), $op}">; +} // Predicates = [HasCMov] +} // SchedRW + +// Floating point loads & stores. 
+let SchedRW = [WriteLoad] in { +let canFoldAsLoad = 1 in { +def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP32:$dst, (loadf32 addr:$src))]>; +let isReMaterializable = 1 in + def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP, + [(set RFP64:$dst, (loadf64 addr:$src))]>; +def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP, + [(set RFP80:$dst, (loadf80 addr:$src))]>; +} // canFoldAsLoad +def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>; +def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP, + [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>; +def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP, + [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>; +def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP32:$dst, (X86fild addr:$src, i64))]>; +def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP64:$dst, (X86fild addr:$src, i64))]>; +def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i16))]>; +def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i32))]>; +def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP, + [(set RFP80:$dst, (X86fild addr:$src, i64))]>; +} // SchedRW + +let SchedRW = [WriteStore] in { +def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, + [(store RFP32:$src, addr:$op)]>; +def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, + [(truncstoref32 RFP64:$src, addr:$op)]>; +def ST_Fp64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, + [(store RFP64:$src, addr:$op)]>; +def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, + [(truncstoref32 RFP80:$src, addr:$op)]>; +def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, + [(truncstoref64 RFP80:$src, addr:$op)]>; +// FST does not support 80-bit memory target; FSTP must be used. 
+ +let mayStore = 1, hasSideEffects = 0 in { +def ST_FpP32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>; +def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>; +def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>; +def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>; +def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>; +} // mayStore + +def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP, + [(store RFP80:$src, addr:$op)]>; + +let mayStore = 1, hasSideEffects = 0 in { +def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>; +def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>; +def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>; +def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>; +def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; +} // mayStore +} // SchedRW + +let mayLoad = 1, SchedRW = [WriteLoad] in { +def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">; +def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">; +def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">; +def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">; +def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">; +def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">; +} +let mayStore = 1, SchedRW = [WriteStore] in { +def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">; +def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">; +def ST_FP32m : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">; +def ST_FP64m : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">; +def ST_FP80m : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">; +def IST_F16m : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">; +def IST_F32m : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">; +def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">; +def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">; +def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">; +} + +// FISTTP requires SSE3 even though it's a FPStack op. 
+let Predicates = [HasSSE3], SchedRW = [WriteStore] in { +def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i16mem RFP32:$src, addr:$op)]>; +def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i32mem RFP32:$src, addr:$op)]>; +def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, + [(X86fp_to_i64mem RFP32:$src, addr:$op)]>; +def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i16mem RFP64:$src, addr:$op)]>; +def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i32mem RFP64:$src, addr:$op)]>; +def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, + [(X86fp_to_i64mem RFP64:$src, addr:$op)]>; +def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i16mem RFP80:$src, addr:$op)]>; +def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i32mem RFP80:$src, addr:$op)]>; +def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, + [(X86fp_to_i64mem RFP80:$src, addr:$op)]>; +} // Predicates = [HasSSE3] + +let mayStore = 1, SchedRW = [WriteStore] in { +def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">; +def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">; +def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), "fisttp{ll}\t$dst">; +} + +// FP Stack manipulation instructions. +let SchedRW = [WriteMove] in { +def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RST:$op), "fld\t$op">; +def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RST:$op), "fst\t$op">; +def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RST:$op), "fstp\t$op">; +def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op">; +} + +// Floating point constant loads. +let isReMaterializable = 1, SchedRW = [WriteZero] in { +def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, + [(set RFP32:$dst, fpimm0)]>; +def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, + [(set RFP32:$dst, fpimm1)]>; +def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP, + [(set RFP64:$dst, fpimm0)]>; +def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP, + [(set RFP64:$dst, fpimm1)]>; +def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, + [(set RFP80:$dst, fpimm0)]>; +def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP, + [(set RFP80:$dst, fpimm1)]>; +} + +let SchedRW = [WriteFLD0] in +def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">; + +let SchedRW = [WriteFLD1] in +def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">; + +let SchedRW = [WriteFLDC], Defs = [FPSW] in { +def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>; +def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>; +def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>; +def FLDLG2 : I<0xD9, MRM_EC, (outs), (ins), "fldlg2", []>; +def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", []>; +} // SchedRW + +// Floating point compares. 
+let SchedRW = [WriteFCom] in { +def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, + [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>; +def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, + [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>; +def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, + [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>; +} // SchedRW +} // Defs = [FPSW] + +let SchedRW = [WriteFCom] in { +// CC = ST(0) cmp ST(i) +let Defs = [EFLAGS, FPSW] in { +def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, + [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>; +def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, + [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>; +def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, + [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>; +} + +let Defs = [FPSW], Uses = [ST0] in { +def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i) + (outs), (ins RST:$reg), "fucom\t$reg">; +def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop + (outs), (ins RST:$reg), "fucomp\t$reg">; +def UCOM_FPPr : FPI<0xDA, MRM_E9, // cmp ST(0) with ST(1), pop, pop + (outs), (ins), "fucompp">; +} + +let Defs = [EFLAGS, FPSW], Uses = [ST0] in { +def UCOM_FIr : FPI<0xDB, MRM5r, // CC = cmp ST(0) with ST(i) + (outs), (ins RST:$reg), "fucomi\t$reg">; +def UCOM_FIPr : FPI<0xDF, MRM5r, // CC = cmp ST(0) with ST(i), pop + (outs), (ins RST:$reg), "fucompi\t$reg">; +} + +let Defs = [EFLAGS, FPSW] in { +def COM_FIr : FPI<0xDB, MRM6r, (outs), (ins RST:$reg), "fcomi\t$reg">; +def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RST:$reg), "fcompi\t$reg">; +} +} // SchedRW + +// Floating point flag ops. +let SchedRW = [WriteALU] in { +let Defs = [AX], Uses = [FPSW] in +def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags + (outs), (ins), "fnstsw\t{%ax|ax}", + [(set AX, (X86fp_stsw FPSW))]>; +let Defs = [FPSW] in +def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world + (outs), (ins i16mem:$dst), "fnstcw\t$dst", + [(X86fp_cwd_get16 addr:$dst)]>; +} // SchedRW +let Defs = [FPSW], mayLoad = 1 in +def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] + (outs), (ins i16mem:$dst), "fldcw\t$dst", []>, + Sched<[WriteLoad]>; + +// FPU control instructions +let SchedRW = [WriteMicrocoded] in { +let Defs = [FPSW] in { +def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", []>; +def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg), "ffree\t$reg">; +def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg), "ffreep\t$reg">; + +def FPNCEST0r : FPI<0xD9, MRM3r, (outs RST:$op), (ins), + "fstpnce\t{%st(0), $op|$op, st(0)}">; + +def FENI8087_NOP : I<0xDB, MRM_E0, (outs), (ins), "feni8087_nop", []>; + +def FDISI8087_NOP : I<0xDB, MRM_E1, (outs), (ins), "fdisi8087_nop", []>; + +// Clear exceptions +def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", []>; +} // Defs = [FPSW] +} // SchedRW + +// Operand-less floating-point instructions for the disassembler. 
+def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", []>, Sched<[WriteNop]>;
+
+let SchedRW = [WriteMicrocoded] in {
+let Defs = [FPSW] in {
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>;
+def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", []>;
+def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", []>;
+def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", []>;
+def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", []>;
+def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", []>;
+def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", []>;
+def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", []>;
+def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", []>;
+def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", []>;
+def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", []>;
+def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", []>;
+def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", []>;
+def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", []>;
+def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", []>;
+def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", []>;
+} // Defs = [FPSW]
+
+def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
+               "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, TB,
+             Requires<[HasFXSR]>;
+def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
+                  "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)]>,
+                TB, Requires<[HasFXSR, In64BitMode]>;
+def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaquemem:$src),
+                "fxrstor\t$src", [(int_x86_fxrstor addr:$src)]>,
+              TB, Requires<[HasFXSR]>;
+def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src),
+                   "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)]>,
+                 TB, Requires<[HasFXSR, In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+
+// Required for CALLs which return f32 / f64 / f80 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op,
+                                               RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op,
+                                               RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op,
+                                               RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
+                                               RFP80:$src)>;
+
+// Floating point constants -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// Used to convert i64 to f64 since there isn't an SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>, + Requires<[FPStackf32]>; +def : Pat<(f80 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>, + Requires<[FPStackf32]>; +def : Pat<(f80 (fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>, + Requires<[FPStackf64]>; + +// FP truncations map onto simple pseudo-value conversions if they are to/from +// the FP stack. We have validated that only value-preserving truncations make +// it through isel. +def : Pat<(f32 (fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, + Requires<[FPStackf32]>; +def : Pat<(f32 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, + Requires<[FPStackf32]>; +def : Pat<(f64 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, + Requires<[FPStackf64]>; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrFormats.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrFormats.td new file mode 100644 index 000000000..47d4719d3 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrFormats.td @@ -0,0 +1,993 @@ +//===-- X86InstrFormats.td - X86 Instruction Formats -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// X86 Instruction Format Definitions. +// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +class Format<bits<7> val> { + bits<7> Value = val; +} + +def Pseudo : Format<0>; +def RawFrm : Format<1>; +def AddRegFrm : Format<2>; +def RawFrmMemOffs : Format<3>; +def RawFrmSrc : Format<4>; +def RawFrmDst : Format<5>; +def RawFrmDstSrc : Format<6>; +def RawFrmImm8 : Format<7>; +def RawFrmImm16 : Format<8>; +def MRMDestMem : Format<32>; +def MRMSrcMem : Format<33>; +def MRMSrcMem4VOp3 : Format<34>; +def MRMSrcMemOp4 : Format<35>; +def MRMXm : Format<39>; +def MRM0m : Format<40>; def MRM1m : Format<41>; def MRM2m : Format<42>; +def MRM3m : Format<43>; def MRM4m : Format<44>; def MRM5m : Format<45>; +def MRM6m : Format<46>; def MRM7m : Format<47>; +def MRMDestReg : Format<48>; +def MRMSrcReg : Format<49>; +def MRMSrcReg4VOp3 : Format<50>; +def MRMSrcRegOp4 : Format<51>; +def MRMXr : Format<55>; +def MRM0r : Format<56>; def MRM1r : Format<57>; def MRM2r : Format<58>; +def MRM3r : Format<59>; def MRM4r : Format<60>; def MRM5r : Format<61>; +def MRM6r : Format<62>; def MRM7r : Format<63>; +def MRM_C0 : Format<64>; def MRM_C1 : Format<65>; def MRM_C2 : Format<66>; +def MRM_C3 : Format<67>; def MRM_C4 : Format<68>; def MRM_C5 : Format<69>; +def MRM_C6 : Format<70>; def MRM_C7 : Format<71>; def MRM_C8 : Format<72>; +def MRM_C9 : Format<73>; def MRM_CA : Format<74>; def MRM_CB : Format<75>; +def MRM_CC : Format<76>; def MRM_CD : Format<77>; def MRM_CE : Format<78>; +def MRM_CF : Format<79>; def MRM_D0 : Format<80>; def MRM_D1 : Format<81>; +def MRM_D2 : Format<82>; def MRM_D3 : Format<83>; def MRM_D4 : Format<84>; +def MRM_D5 : Format<85>; def MRM_D6 : Format<86>; def MRM_D7 : Format<87>; +def MRM_D8 : Format<88>; def MRM_D9 : Format<89>; def MRM_DA : Format<90>; +def MRM_DB : Format<91>; def MRM_DC : Format<92>; def MRM_DD : Format<93>; +def MRM_DE : Format<94>; def MRM_DF : Format<95>; def 
MRM_E0 : Format<96>;
+def MRM_E1 : Format<97>; def MRM_E2 : Format<98>; def MRM_E3 : Format<99>;
+def MRM_E4 : Format<100>; def MRM_E5 : Format<101>; def MRM_E6 : Format<102>;
+def MRM_E7 : Format<103>; def MRM_E8 : Format<104>; def MRM_E9 : Format<105>;
+def MRM_EA : Format<106>; def MRM_EB : Format<107>; def MRM_EC : Format<108>;
+def MRM_ED : Format<109>; def MRM_EE : Format<110>; def MRM_EF : Format<111>;
+def MRM_F0 : Format<112>; def MRM_F1 : Format<113>; def MRM_F2 : Format<114>;
+def MRM_F3 : Format<115>; def MRM_F4 : Format<116>; def MRM_F5 : Format<117>;
+def MRM_F6 : Format<118>; def MRM_F7 : Format<119>; def MRM_F8 : Format<120>;
+def MRM_F9 : Format<121>; def MRM_FA : Format<122>; def MRM_FB : Format<123>;
+def MRM_FC : Format<124>; def MRM_FD : Format<125>; def MRM_FE : Format<126>;
+def MRM_FF : Format<127>;
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<4> val> {
+  bits<4> Value = val;
+}
+def NoImm : ImmType<0>;
+def Imm8 : ImmType<1>;
+def Imm8PCRel : ImmType<2>;
+def Imm8Reg : ImmType<3>; // Register encoded in [7:4].
+def Imm16 : ImmType<4>;
+def Imm16PCRel : ImmType<5>;
+def Imm32 : ImmType<6>;
+def Imm32PCRel : ImmType<7>;
+def Imm32S : ImmType<8>;
+def Imm64 : ImmType<9>;
+
+// FPFormat - This specifies what form this FP instruction has. This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+  bits<3> Value = val;
+}
+def NotFP : FPFormat<0>;
+def ZeroArgFP : FPFormat<1>;
+def OneArgFP : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP : FPFormat<4>;
+def CompareFP : FPFormat<5>;
+def CondMovFP : FPFormat<6>;
+def SpecialFP : FPFormat<7>;
+
+// Class specifying the SSE execution domain, used by the SSEDomainFix pass.
+// Keep in sync with tables in X86InstrInfo.cpp.
+class Domain<bits<2> val> {
+  bits<2> Value = val;
+}
+def GenericDomain : Domain<0>;
+def SSEPackedSingle : Domain<1>;
+def SSEPackedDouble : Domain<2>;
+def SSEPackedInt : Domain<3>;
+
+// Class specifying the vector form of the decompressed
+// 8-bit displacement.
+class CD8VForm<bits<3> val> {
+  bits<3> Value = val;
+}
+def CD8VF : CD8VForm<0>; // v := VL
+def CD8VH : CD8VForm<1>; // v := VL/2
+def CD8VQ : CD8VForm<2>; // v := VL/4
+def CD8VO : CD8VForm<3>; // v := VL/8
+// The tuple (subvector) forms.
+def CD8VT1 : CD8VForm<4>; // v := 1
+def CD8VT2 : CD8VForm<5>; // v := 2
+def CD8VT4 : CD8VForm<6>; // v := 4
+def CD8VT8 : CD8VForm<7>; // v := 8
+
+// Class specifying the prefix used as an opcode extension.
+class Prefix<bits<3> val> {
+  bits<3> Value = val;
+}
+def NoPrfx : Prefix<0>;
+def PD : Prefix<1>;
+def XS : Prefix<2>;
+def XD : Prefix<3>;
+def PS : Prefix<4>; // Similar to NoPrfx, but disassembler uses this to know
+                    // that other instructions with this opcode use PD/XS/XD
+                    // and if any of those is not supported they shouldn't
+                    // decode to this instruction. e.g. ANDSS/ANDSD don't
+                    // exist, but the 0xf2/0xf3 encoding shouldn't
+                    // decode to ANDPS.
+
+// Class specifying the opcode map.
+class Map<bits<3> val> { + bits<3> Value = val; +} +def OB : Map<0>; +def TB : Map<1>; +def T8 : Map<2>; +def TA : Map<3>; +def XOP8 : Map<4>; +def XOP9 : Map<5>; +def XOPA : Map<6>; +def ThreeDNow : Map<7>; + +// Class specifying the encoding +class Encoding<bits<2> val> { + bits<2> Value = val; +} +def EncNormal : Encoding<0>; +def EncVEX : Encoding<1>; +def EncXOP : Encoding<2>; +def EncEVEX : Encoding<3>; + +// Operand size for encodings that change based on mode. +class OperandSize<bits<2> val> { + bits<2> Value = val; +} +def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix. +def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode. +def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode. + +// Address size for encodings that change based on mode. +class AddressSize<bits<2> val> { + bits<2> Value = val; +} +def AdSizeX : AddressSize<0>; // Address size determined using addr operand. +def AdSize16 : AddressSize<1>; // Encodes a 16-bit address. +def AdSize32 : AddressSize<2>; // Encodes a 32-bit address. +def AdSize64 : AddressSize<3>; // Encodes a 64-bit address. + +// Prefix byte classes which are used to indicate to the ad-hoc machine code +// emitter that various prefix bytes are required. +class OpSize16 { OperandSize OpSize = OpSize16; } +class OpSize32 { OperandSize OpSize = OpSize32; } +class AdSize16 { AddressSize AdSize = AdSize16; } +class AdSize32 { AddressSize AdSize = AdSize32; } +class AdSize64 { AddressSize AdSize = AdSize64; } +class REX_W { bit hasREX_WPrefix = 1; } +class LOCK { bit hasLockPrefix = 1; } +class REP { bit hasREPPrefix = 1; } +class TB { Map OpMap = TB; } +class T8 { Map OpMap = T8; } +class TA { Map OpMap = TA; } +class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; } +class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; } +class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; } +class ThreeDNow { Map OpMap = ThreeDNow; } +class OBXS { Prefix OpPrefix = XS; } +class PS : TB { Prefix OpPrefix = PS; } +class PD : TB { Prefix OpPrefix = PD; } +class XD : TB { Prefix OpPrefix = XD; } +class XS : TB { Prefix OpPrefix = XS; } +class T8PS : T8 { Prefix OpPrefix = PS; } +class T8PD : T8 { Prefix OpPrefix = PD; } +class T8XD : T8 { Prefix OpPrefix = XD; } +class T8XS : T8 { Prefix OpPrefix = XS; } +class TAPS : TA { Prefix OpPrefix = PS; } +class TAPD : TA { Prefix OpPrefix = PD; } +class TAXD : TA { Prefix OpPrefix = XD; } +class VEX { Encoding OpEnc = EncVEX; } +class VEX_W { bits<2> VEX_WPrefix = 1; } +class VEX_WIG { bits<2> VEX_WPrefix = 2; } +// Special version of VEX_W that can be changed to VEX.W==0 for EVEX2VEX. +// FIXME: We should consider adding separate bits for VEX_WIG and the extra +// part of W1X. This would probably simplify the tablegen emitters and +// the TSFlags creation below. 
+class VEX_W1X { bits<2> VEX_WPrefix = 3; } +class VEX_4V : VEX { bit hasVEX_4V = 1; } +class VEX_L { bit hasVEX_L = 1; } +class VEX_LIG { bit ignoresVEX_L = 1; } +class EVEX { Encoding OpEnc = EncEVEX; } +class EVEX_4V : EVEX { bit hasVEX_4V = 1; } +class EVEX_K { bit hasEVEX_K = 1; } +class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; } +class EVEX_B { bit hasEVEX_B = 1; } +class EVEX_RC { bit hasEVEX_RC = 1; } +class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; } +class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; } +class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; } +class NOTRACK { bit hasNoTrackPrefix = 1; } + +// Specify AVX512 8-bit compressed displacement encoding based on the vector +// element size in bits (8, 16, 32, 64) and the CDisp8 form. +class EVEX_CD8<int esize, CD8VForm form> { + int CD8_EltSize = !srl(esize, 3); + bits<3> CD8_Form = form.Value; +} + +class XOP { Encoding OpEnc = EncXOP; } +class XOP_4V : XOP { bit hasVEX_4V = 1; } + +// Specify the alternative register form instruction to replace the current +// instruction in case it was picked during generation of memory folding tables +class FoldGenData<string _RegisterForm> { + string FoldGenRegForm = _RegisterForm; +} + +// Provide a specific instruction to be used by the EVEX2VEX conversion. +class EVEX2VEXOverride<string VEXInstrName> { + string EVEX2VEXOverride = VEXInstrName; +} + +// Mark the instruction as "illegal to memory fold/unfold" +class NotMemoryFoldable { bit isMemoryFoldable = 0; } + +// Prevent EVEX->VEX conversion from considering this instruction. +class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; } + +class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, + string AsmStr, Domain d = GenericDomain> + : Instruction { + let Namespace = "X86"; + + bits<8> Opcode = opcod; + Format Form = f; + bits<7> FormBits = Form.Value; + ImmType ImmT = i; + + dag OutOperandList = outs; + dag InOperandList = ins; + string AsmString = AsmStr; + + // If this is a pseudo instruction, mark it isCodeGenOnly. + let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo"); + + // + // Attributes specific to X86 instructions... + // + bit ForceDisassemble = 0; // Force instruction to disassemble even though it's + // isCodeGenonly. Needed to hide an ambiguous + // AsmString from the parser, but still disassemble. + + OperandSize OpSize = OpSizeFixed; // Does this instruction's encoding change + // based on operand size of the mode? + bits<2> OpSizeBits = OpSize.Value; + AddressSize AdSize = AdSizeX; // Does this instruction's encoding change + // based on address size of the mode? + bits<2> AdSizeBits = AdSize.Value; + + Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have? + bits<3> OpPrefixBits = OpPrefix.Value; + Map OpMap = OB; // Which opcode map does this inst have? + bits<3> OpMapBits = OpMap.Value; + bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix? + FPFormat FPForm = NotFP; // What flavor of FP instruction is this? + bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? + Domain ExeDomain = d; + bit hasREPPrefix = 0; // Does this inst have a REP prefix? + Encoding OpEnc = EncNormal; // Encoding used by this instruction + bits<2> OpEncBits = OpEnc.Value; + bits<2> VEX_WPrefix = 0; // Does this inst set the VEX_W field? + bit hasVEX_4V = 0; // Does this inst require the VEX.VVVV field? + bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? 
+  bit ignoresVEX_L = 0;   // Does this instruction ignore the L-bit?
+  bit hasEVEX_K = 0;      // Does this inst require masking?
+  bit hasEVEX_Z = 0;      // Does this inst set the EVEX_Z field?
+  bit hasEVEX_L2 = 0;     // Does this inst set the EVEX_L2 field?
+  bit hasEVEX_B = 0;      // Does this inst set the EVEX_B field?
+  bits<3> CD8_Form = 0;   // Compressed disp8 form - vector-width.
+  // Declare it int rather than bits<4> so that all bits are defined when
+  // assigning to bits<7>.
+  int CD8_EltSize = 0;    // Compressed disp8 form - element-size in bytes.
+  bit hasEVEX_RC = 0;     // Explicitly specified rounding control in FP instruction.
+  bit hasNoTrackPrefix = 0; // Does this inst have a 0x3E (NoTrack) prefix?
+
+  bits<2> EVEX_LL;
+  let EVEX_LL{0} = hasVEX_L;
+  let EVEX_LL{1} = hasEVEX_L2;
+  // Vector size in bytes.
+  bits<7> VectSize = !shl(16, EVEX_LL);
+
+  // The scaling factor for AVX512's compressed displacement is either
+  // - the size of a power-of-two number of elements or
+  // - the size of a single element for broadcasts or
+  // - the total vector size divided by a power-of-two number.
+  // Possible values are: 0 (non-AVX512 inst), 1, 2, 4, 8, 16, 32 and 64.
+  bits<7> CD8_Scale = !if (!eq (OpEnc.Value, EncEVEX.Value),
+                           !if (CD8_Form{2},
+                                !shl(CD8_EltSize, CD8_Form{1-0}),
+                                !if (hasEVEX_B,
+                                     CD8_EltSize,
+                                     !srl(VectSize, CD8_Form{1-0}))), 0);
+
+  // Used in the memory folding generation (TableGen backend) to point to an alternative
+  // instruction to replace the current one in case it got picked during generation.
+  string FoldGenRegForm = ?;
+
+  // Used to prevent an explicit EVEX2VEX override for this instruction.
+  string EVEX2VEXOverride = ?;
+
+  bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction?
+  bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.
+
+  // TSFlags layout should be kept in sync with X86BaseInfo.h.
+  let TSFlags{6-0} = FormBits;
+  let TSFlags{8-7} = OpSizeBits;
+  let TSFlags{10-9} = AdSizeBits;
+  // No need for 3rd bit, we don't need to distinguish NoPrfx from PS.
+  let TSFlags{12-11} = OpPrefixBits{1-0};
+  let TSFlags{15-13} = OpMapBits;
+  let TSFlags{16} = hasREX_WPrefix;
+  let TSFlags{20-17} = ImmT.Value;
+  let TSFlags{23-21} = FPForm.Value;
+  let TSFlags{24} = hasLockPrefix;
+  let TSFlags{25} = hasREPPrefix;
+  let TSFlags{27-26} = ExeDomain.Value;
+  let TSFlags{29-28} = OpEncBits;
+  let TSFlags{37-30} = Opcode;
+  // Currently no need for second bit in TSFlags - W Ignore is equivalent to 0.
+  let TSFlags{38} = VEX_WPrefix{0};
+  let TSFlags{39} = hasVEX_4V;
+  let TSFlags{40} = hasVEX_L;
+  let TSFlags{41} = hasEVEX_K;
+  let TSFlags{42} = hasEVEX_Z;
+  let TSFlags{43} = hasEVEX_L2;
+  let TSFlags{44} = hasEVEX_B;
+  // If we run out of TSFlags bits, it's possible to encode this in 3 bits.
+ let TSFlags{51-45} = CD8_Scale; + let TSFlags{52} = hasEVEX_RC; + let TSFlags{53} = hasNoTrackPrefix; +} + +class PseudoI<dag oops, dag iops, list<dag> pattern> + : X86Inst<0, Pseudo, NoImm, oops, iops, ""> { + let Pattern = pattern; +} + +class I<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, Domain d = GenericDomain> + : X86Inst<o, f, NoImm, outs, ins, asm, d> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii8<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, Domain d = GenericDomain> + : X86Inst<o, f, Imm8, outs, ins, asm, d> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii8Reg<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern, Domain d = GenericDomain> + : X86Inst<o, f, Imm8Reg, outs, ins, asm, d> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii8PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm8PCRel, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm16, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm32, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} +class Ii32S<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm32S, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Ii64<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm64, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Ii16PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm16PCRel, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm32PCRel, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} + +// FPStack Instruction Templates: +// FPI - Floating Point Instruction template. +class FPI<bits<8> o, Format F, dag outs, dag ins, string asm> + : I<o, F, outs, ins, asm, []> {} + +// FpI_ - Floating Point Pseudo Instruction template. Not Predicated. 
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern> + : PseudoI<outs, ins, pattern> { + let FPForm = fp; +} + +// Templates for instructions that use a 16- or 32-bit segmented address as +// their only operand: lcall (FAR CALL) and ljmp (FAR JMP) +// +// Iseg16 - 16-bit segment selector, 16-bit offset +// Iseg32 - 16-bit segment selector, 32-bit offset + +class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm16, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} + +class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, + list<dag> pattern> + : X86Inst<o, f, Imm32, outs, ins, asm> { + let Pattern = pattern; + let CodeSize = 3; +} + +// SI - SSE 1 & 2 scalar instructions +class SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d = GenericDomain> + : I<o, F, outs, ins, asm, pattern, d> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX], + !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], + !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2], + !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2], + [UseSSE1]))))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} + +// SI - SSE 1 & 2 scalar intrinsics - vex form available on AVX512 +class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d = GenericDomain> + : I<o, F, outs, ins, asm, pattern, d> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX], + !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], + !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2], + !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2], + [UseSSE1]))))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} +// SIi8 - SSE 1 & 2 scalar instructions - vex form available on AVX512 +class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX], + !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1], + [UseSSE2]))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} + +// PI - SSE 1 & 2 packed instructions +class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, + Domain d> + : I<o, F, outs, ins, asm, pattern, d> { + let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512], + !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX], + !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2], + [UseSSE1]))); + + // AVX instructions have a 'v' prefix in the mnemonic + let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm), + !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm), + asm)); +} + +// MMXPI - SSE 1 & 2 packed instructions with MMX operands +class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, + Domain d> + : I<o, F, outs, ins, asm, pattern, d> { + let Predicates = !if(!eq(OpPrefix.Value, PD.Value), [HasMMX, HasSSE2], + [HasMMX, 
HasSSE1]);
+}
+
+// PIi8 - SSE 1 & 2 packed instructions with immediate
+class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, Domain d>
+      : Ii8<o, F, outs, ins, asm, pattern, d> {
+  let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
+                   !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+                   !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
+                   [UseSSE1])));
+
+  // AVX instructions have a 'v' prefix in the mnemonic
+  let AsmString = !if(!eq(OpEnc.Value, EncEVEX.Value), !strconcat("v", asm),
+                  !if(!eq(OpEnc.Value, EncVEX.Value), !strconcat("v", asm),
+                  asm));
+}
+
+// SSE1 Instruction Templates:
+//
+// SSI - SSE1 instructions with XS prefix.
+// PSI - SSE1 instructions with PS prefix.
+// PSIi8 - SSE1 instructions with ImmT == Imm8 and PS prefix.
+// VSSI - SSE1 instructions with XS prefix in AVX form.
+// VPSI - SSE1 instructions with PS prefix in AVX form, packed single.
+
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE1]>;
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+          list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
+        Requires<[UseSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS,
+        Requires<[UseSSE1]>;
+class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS,
+        Requires<[HasAVX]>;
+class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, PS,
+        Requires<[HasAVX]>;
+
+// SSE2 Instruction Templates:
+//
+// SDI - SSE2 instructions with XD prefix.
+// SDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix.
+// S2SI - SSE2 instructions with XS prefix.
+// S2SIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
+// PDI - SSE2 instructions with PD prefix, packed double domain.
+// PDIi8 - SSE2 instructions with ImmT == Imm8 and PD prefix.
+// VSDI - SSE2 scalar instructions with XD prefix in AVX form.
+// VPDI - SSE2 vector instructions with PD prefix in AVX form,
+// packed double domain.
+// VS2I - SSE2 scalar instructions with PD prefix in AVX form.
+// S2I - SSE2 scalar instructions with PD prefix.
+// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+// MMX operands.
+// MMXS2SIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+// MMX operands.
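+//
+// Illustrative sketch (hypothetical def, not part of the original file): a
+// scalar SSE2 instruction would typically be declared through one of the
+// templates defined below, e.g. the XD-prefixed SDI class:
+//
+//   def ADDSD_example : SDI<0x58, MRMSrcReg, (outs FR64:$dst),
+//                           (ins FR64:$src1, FR64:$src2),
+//                           "addsd\t{$src2, $dst|$dst, $src2}", []>;
+//
+// The SDI base class supplies the XD (0xF2) prefix and the
+// Requires<[UseSSE2]> predicate, so the def itself only states the opcode,
+// ModRM format, operands and assembly string.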
+ +class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, XD, Requires<[UseSSE2]>; +class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[UseSSE2]>; +class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>; +class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>; +class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD, + Requires<[UseSSE2]>; +class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD, + Requires<[UseSSE2]>; +class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XD, + Requires<[UseAVX]>; +class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS, + Requires<[HasAVX]>; +class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>, + PD, Requires<[HasAVX]>; +class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, !strconcat("v", asm), pattern>, PD, + Requires<[UseAVX]>; +class S2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, PD, Requires<[UseSSE2]>; +class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX, HasSSE2]>; +class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX, HasSSE2]>; + +// SSE3 Instruction Templates: +// +// S3I - SSE3 instructions with PD prefixes. +// S3SI - SSE3 instructions with XS prefix. +// S3DI - SSE3 instructions with XD prefix. + +class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, XS, + Requires<[UseSSE3]>; +class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, XD, + Requires<[UseSSE3]>; +class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD, + Requires<[UseSSE3]>; + + +// SSSE3 Instruction Templates: +// +// SS38I - SSSE3 instructions with T8 prefix. +// SS3AI - SSSE3 instructions with TA prefix. +// MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands. +// MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands. +// +// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version +// uses the MMX registers. The 64-bit versions are grouped with the MMX +// classes. They need to be enabled even if AVX is enabled. 
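+//
+// Illustrative sketch (hypothetical defs, not part of the original file): an
+// SSSE3 operation such as pshufb is typically declared once per register
+// file, using the 128-bit and 64-bit templates defined below:
+//
+//   def PSHUFB_xmm_example : SS38I<0x00, MRMSrcReg, (outs VR128:$dst),
+//                                  (ins VR128:$src1, VR128:$src2),
+//                                  "pshufb\t{$src2, $dst|$dst, $src2}", []>;
+//   def PSHUFB_mmx_example : MMXSS38I<0x00, MRMSrcReg, (outs VR64:$dst),
+//                                     (ins VR64:$src1, VR64:$src2),
+//                                     "pshufb\t{$src2, $dst|$dst, $src2}", []>;
+//
+// SS38I carries Requires<[UseSSSE3]>, while MMXSS38I carries
+// Requires<[HasMMX, HasSSSE3]>, which is why the MMX form stays usable even
+// when AVX is enabled.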
+ +class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + Requires<[UseSSSE3]>; +class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + Requires<[UseSSSE3]>; +class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PS, + Requires<[HasMMX, HasSSSE3]>; +class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPS, + Requires<[HasMMX, HasSSSE3]>; + +// SSE4.1 Instruction Templates: +// +// SS48I - SSE 4.1 instructions with T8 prefix. +// SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8. +// +class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + Requires<[UseSSE41]>; +class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + Requires<[UseSSE41]>; + +// SSE4.2 Instruction Templates: +// +// SS428I - SSE 4.2 instructions with T8 prefix. +class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + Requires<[UseSSE42]>; + +// SS42FI - SSE 4.2 instructions with T8XD prefix. +// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns. +class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, T8XD, Requires<[HasSSE42]>; + +// SS42AI = SSE 4.2 instructions with TA prefix +class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + Requires<[UseSSE42]>; + +// AVX Instruction Templates: +// Instructions introduced in AVX (no SSE equivalent forms) +// +// AVX8I - AVX instructions with T8PD prefix. +// AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8. +class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + Requires<[HasAVX]>; +class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + Requires<[HasAVX]>; + +// AVX2 Instruction Templates: +// Instructions introduced in AVX2 (no SSE equivalent forms) +// +// AVX28I - AVX2 instructions with T8PD prefix. +// AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8. +class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + Requires<[HasAVX2]>; +class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + Requires<[HasAVX2]>; + + +// AVX-512 Instruction Templates: +// Instructions introduced in AVX-512 (no SSE equivalent forms) +// +// AVX5128I - AVX-512 instructions with T8PD prefix. +// AVX512AIi8 - AVX-512 instructions with TAPD prefix and ImmT = Imm8. +// AVX512PDI - AVX-512 instructions with PD, double packed. +// AVX512PSI - AVX-512 instructions with PS, single packed. +// AVX512XS8I - AVX-512 instructions with T8 and XS prefixes. +// AVX512XSI - AVX-512 instructions with XS prefix, generic domain. 
+// AVX512BI - AVX-512 instructions with PD, int packed domain. +// AVX512SI - AVX-512 scalar instructions with PD prefix. + +class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + Requires<[HasAVX512]>; +class AVX5128IBase : T8PD { + Domain ExeDomain = SSEPackedInt; +} +class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8XS, + Requires<[HasAVX512]>; +class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, XS, + Requires<[HasAVX512]>; +class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, XD, + Requires<[HasAVX512]>; +class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, PD, + Requires<[HasAVX512]>; +class AVX512BIBase : PD { + Domain ExeDomain = SSEPackedInt; +} +class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, PD, + Requires<[HasAVX512]>; +class AVX512BIi8Base : PD { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} +class AVX512XSIi8Base : XS { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} +class AVX512XDIi8Base : XD { + Domain ExeDomain = SSEPackedInt; + ImmType ImmT = Imm8; +} +class AVX512PSIi8Base : PS { + Domain ExeDomain = SSEPackedSingle; + ImmType ImmT = Imm8; +} +class AVX512PDIi8Base : PD { + Domain ExeDomain = SSEPackedDouble; + ImmType ImmT = Imm8; +} +class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + Requires<[HasAVX512]>; +class AVX512AIi8Base : TAPD { + ImmType ImmT = Imm8; +} +class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, + Requires<[HasAVX512]>; +class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD, + Requires<[HasAVX512]>; +class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS, + Requires<[HasAVX512]>; +class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d> + : Ii8<o, F, outs, ins, asm, pattern, d>, Requires<[HasAVX512]>; +class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern, Domain d> + : I<o, F, outs, ins, asm, pattern, d>, Requires<[HasAVX512]>; +class AVX512FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : I<o, F, outs, ins, asm, pattern>, T8PD, + EVEX_4V, Requires<[HasAVX512]>; +class AVX512FMA3Base : T8PD, EVEX_4V; + +class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : I<o, F, outs, ins, asm, pattern>, Requires<[HasAVX512]>; + +// AES Instruction Templates: +// +// AES8I +// These use the same encoding as the SSE4.2 T8 and TA encodings. 
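+//
+// Illustrative sketch (hypothetical def, not part of the original file):
+//
+//   def AESENC_example : AES8I<0xDC, MRMSrcReg, (outs VR128:$dst),
+//                              (ins VR128:$src1, VR128:$src2),
+//                              "aesenc\t{$src2, $dst|$dst, $src2}", []>;
+//
+// The AES8I class below contributes the T8PD map/prefix bytes (66 0F 38) and
+// the Requires<[NoAVX, HasAES]> predicate.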
+class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + Requires<[NoAVX, HasAES]>; + +class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + Requires<[NoAVX, HasAES]>; + +// PCLMUL Instruction Templates +class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD; + +// FMA3 Instruction Templates +class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : I<o, F, outs, ins, asm, pattern>, T8PD, + VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>; +class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : I<o, F, outs, ins, asm, pattern>, T8PD, + VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>; +class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : I<o, F, outs, ins, asm, pattern>, T8PD, + VEX_4V, FMASC, Requires<[HasFMA, NoAVX512]>; + +// FMA4 Instruction Templates +class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD, + VEX_4V, FMASC, Requires<[HasFMA4, NoVLX]>; +class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD, + VEX_4V, FMASC, Requires<[HasFMA4, NoAVX512]>; +class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD, + VEX_4V, FMASC, Requires<[HasFMA4]>; + +// XOP 2, 3 and 4 Operand Instruction Template +class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, + XOP9, Requires<[HasXOP]>; + +// XOP 2 and 3 Operand Instruction Templates with imm byte +class IXOPi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, + XOP8, Requires<[HasXOP]>; +// XOP 4 Operand Instruction Templates with imm byte +class IXOPi8Reg<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedDouble>, + XOP8, Requires<[HasXOP]>; + +// XOP 5 operand instruction (VEX encoding!) +class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + VEX_4V, Requires<[HasXOP]>; + +// X86-64 Instruction templates... 
+// + +class RI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, REX_W; +class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, REX_W; +class RIi16 <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii16<o, F, outs, ins, asm, pattern>, REX_W; +class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii32<o, F, outs, ins, asm, pattern>, REX_W; +class RIi32S <bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii32S<o, F, outs, ins, asm, pattern>, REX_W; +class RIi64<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii64<o, F, outs, ins, asm, pattern>, REX_W; + +class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : S2I<o, F, outs, ins, asm, pattern>, REX_W; +class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : VS2I<o, F, outs, ins, asm, pattern>, VEX_W; + +// MMX Instruction templates +// + +// MMXI - MMX instructions with TB prefix. +// MMXI32 - MMX instructions with TB prefix valid only in 32 bit mode. +// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode. +// MMX2I - MMX / SSE2 instructions with PD prefix. +// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix. +// MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix. +// MMXID - MMX instructions with XD prefix. +// MMXIS - MMX instructions with XS prefix. +class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX]>; +class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX,Not64BitMode]>; +class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX,In64BitMode]>; +class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, PS, REX_W, Requires<[HasMMX]>; +class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern>, PD, Requires<[HasMMX]>; +class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX]>; +class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX]>; +class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX]>; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrFragmentsSIMD.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrFragmentsSIMD.td new file mode 100644 index 000000000..739275907 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrFragmentsSIMD.td @@ -0,0 +1,1075 @@ +//===-- X86InstrFragmentsSIMD.td - x86 SIMD ISA ------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides pattern fragments useful for SIMD instructions. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MMX specific DAG Nodes. +//===----------------------------------------------------------------------===// + +// Low word of MMX to GPR. +def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1, + [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>; +// GPR to low word of MMX. +def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1, + [SDTCisVT<0, x86mmx>, SDTCisVT<1, i32>]>>; + +//===----------------------------------------------------------------------===// +// MMX Pattern Fragments +//===----------------------------------------------------------------------===// + +def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>; + +//===----------------------------------------------------------------------===// +// SSE specific DAG Nodes. +//===----------------------------------------------------------------------===// + +def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>]>; + +def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; +def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; +def X86fmins : SDNode<"X86ISD::FMINS", SDTFPBinOp>; +def X86fmaxs : SDNode<"X86ISD::FMAXS", SDTFPBinOp>; + +// Commutative and Associative FMIN and FMAX. +def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; + +def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative]>; +def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp>; +def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; +def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; +def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; +def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; +def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; +def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; +def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; +def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; +def X86pshufb : SDNode<"X86ISD::PSHUFB", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86psadbw : SDNode<"X86ISD::PSADBW", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>, [SDNPCommutative]>; +def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>, SDTCisInt<3>]>>; +def X86andnp : SDNode<"X86ISD::ANDNP", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86multishift : SDNode<"X86ISD::MULTISHIFT", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>]>>; +def X86pextrb : SDNode<"X86ISD::PEXTRB", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>, + SDTCisPtrTy<2>]>>; +def X86pextrw : SDNode<"X86ISD::PEXTRW", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>, + SDTCisPtrTy<2>]>>; +def X86pinsrb : SDNode<"X86ISD::PINSRB", + SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86pinsrw : 
SDNode<"X86ISD::PINSRW", + SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; +def X86insertps : SDNode<"X86ISD::INSERTPS", + SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, + SDTCisVT<2, v4f32>, SDTCisVT<3, i8>]>>; +def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", + SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; + +def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def X86vzext : SDNode<"X86ISD::VZEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<1, 0>]>>; + +def X86vsext : SDNode<"X86ISD::VSEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<1, 0>]>>; + +def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<0, 1>]>; + +def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>; +def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>; +def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>; + +def X86vfpext : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCisSameSizeAs<0, 1>]>>; +def X86vfpround: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCisSameSizeAs<0, 1>]>>; + +def X86froundRnd: SDNode<"X86ISD::VFPROUNDS_RND", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, + SDTCisSameAs<0, 1>, + SDTCVecEltisVT<2, f64>, + SDTCisSameSizeAs<0, 2>, + SDTCisVT<3, i32>]>>; + +def X86fpextRnd : SDNode<"X86ISD::VFPEXTS_RND", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f64>, + SDTCisSameAs<0, 1>, + SDTCVecEltisVT<2, f32>, + SDTCisSameSizeAs<0, 2>, + SDTCisVT<3, i32>]>>; + +def X86vshiftimm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>, SDTCisInt<0>]>; + +def X86vshldq : SDNode<"X86ISD::VSHLDQ", X86vshiftimm>; +def X86vshrdq : SDNode<"X86ISD::VSRLDQ", X86vshiftimm>; +def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; +def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>; +def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>; + +def X86CmpMaskCC : + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>; +def X86CmpMaskCCRound : + SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisFP<1>, SDTCisSameAs<2, 1>, + SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, + SDTCisVT<4, i32>]>; +def X86CmpMaskCCScalar : + SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>]>; + +def X86CmpMaskCCScalarRound : + SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; + +def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; +// Hack to make CMPM commutable in tablegen patterns for load folding. 
+def X86cmpm_c : SDNode<"X86ISD::CMPM", X86CmpMaskCC, [SDNPCommutative]>; +def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; +def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>; +def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>; + +def X86phminpos: SDNode<"X86ISD::PHMINPOS", + SDTypeProfile<1, 1, [SDTCisVT<0, v8i16>, SDTCisVT<1, v8i16>]>>; + +def X86vshiftuniform : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>, SDTCisInt<0>, + SDTCisInt<1>]>; + +def X86vshl : SDNode<"X86ISD::VSHL", X86vshiftuniform>; +def X86vsrl : SDNode<"X86ISD::VSRL", X86vshiftuniform>; +def X86vsra : SDNode<"X86ISD::VSRA", X86vshiftuniform>; + +def X86vshiftvariable : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisInt<0>]>; + +def X86vsrav : SDNode<"X86ISD::VSRAV", X86vshiftvariable>; + +def X86vshli : SDNode<"X86ISD::VSHLI", X86vshiftimm>; +def X86vsrli : SDNode<"X86ISD::VSRLI", X86vshiftimm>; +def X86vsrai : SDNode<"X86ISD::VSRAI", X86vshiftimm>; + +def X86kshiftl : SDNode<"X86ISD::KSHIFTL", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>, + SDTCisSameAs<0, 1>, + SDTCisVT<2, i8>]>>; +def X86kshiftr : SDNode<"X86ISD::KSHIFTR", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>, + SDTCisSameAs<0, 1>, + SDTCisVT<2, i8>]>>; + +def X86kadd : SDNode<"X86ISD::KADD", SDTIntBinOp, [SDNPCommutative]>; + +def X86vrotli : SDNode<"X86ISD::VROTLI", X86vshiftimm>; +def X86vrotri : SDNode<"X86ISD::VROTRI", X86vshiftimm>; + +def X86vpshl : SDNode<"X86ISD::VPSHL", X86vshiftvariable>; +def X86vpsha : SDNode<"X86ISD::VPSHA", X86vshiftvariable>; + +def X86vpcom : SDNode<"X86ISD::VPCOM", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>, SDTCisInt<0>]>>; +def X86vpcomu : SDNode<"X86ISD::VPCOMU", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>, SDTCisInt<0>]>>; +def X86vpermil2 : SDNode<"X86ISD::VPERMIL2", + SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisFP<0>, SDTCisInt<3>, + SDTCisSameNumEltsAs<0, 3>, + SDTCisSameSizeAs<0,3>, + SDTCisVT<4, i8>]>>; +def X86vpperm : SDNode<"X86ISD::VPPERM", + SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisSameAs<0, 3>]>>; + +def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisVec<1>, + SDTCisSameAs<2, 1>]>; + +def X86addus : SDNode<"X86ISD::ADDUS", SDTIntBinOp, [SDNPCommutative]>; +def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; +def X86adds : SDNode<"X86ISD::ADDS", SDTIntBinOp, [SDNPCommutative]>; +def X86subs : SDNode<"X86ISD::SUBS", SDTIntBinOp>; +def X86mulhrs : SDNode<"X86ISD::MULHRS", SDTIntBinOp, [SDNPCommutative]>; +def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp, [SDNPCommutative]>; +def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; +def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; +def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; +def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; + +def X86movmsk : SDNode<"X86ISD::MOVMSK", + SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>>; + +def X86selects : SDNode<"X86ISD::SELECTS", + SDTypeProfile<1, 3, [SDTCisVT<1, v1i1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<2, 3>]>>; + +def X86pmuludq : SDNode<"X86ISD::PMULUDQ", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>]>, + [SDNPCommutative]>; +def X86pmuldq : SDNode<"X86ISD::PMULDQ", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCisSameAs<0,1>, + 
SDTCisSameAs<1,2>]>, + [SDNPCommutative]>; + +def X86extrqi : SDNode<"X86ISD::EXTRQI", + SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>, SDTCisVT<3, i8>]>>; +def X86insertqi : SDNode<"X86ISD::INSERTQI", + SDTypeProfile<1, 4, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisVT<3, i8>, + SDTCisVT<4, i8>]>>; + +// Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get +// translated into one of the target nodes below during lowering. +// Note: this is a work in progress... +def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; +def SDTShuff2OpFP : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisFP<0>, + SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>; + +def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisFP<0>, SDTCisInt<2>, + SDTCisSameNumEltsAs<0,2>, + SDTCisSameSizeAs<0,2>]>; +def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>; +def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>; +def SDTFPBinOpImm: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i32>]>; +def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i32>, + SDTCisVT<4, i32>]>; +def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisInt<3>, + SDTCisSameSizeAs<0, 3>, + SDTCisSameNumEltsAs<0, 3>, + SDTCisVT<4, i32>, + SDTCisVT<5, i32>]>; +def SDTFPUnaryOpImm: SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisSameAs<0,1>, + SDTCisVT<2, i32>]>; +def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisSameAs<0,1>, + SDTCisVT<2, i32>, + SDTCisVT<3, i32>]>; + +def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisInt<0>, SDTCisInt<1>]>; + +def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>; + +def SDTTernlog : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>, SDTCisVT<4, i8>]>; + +def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc. + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisVT<3, i32>]>; + +def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [ // fsqrt_round, fgetexp_round, etc. 
+ SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisVT<2, i32>]>; + +def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, + SDTCisFP<0>, SDTCisVT<4, i32>]>; + +def X86PAlignr : SDNode<"X86ISD::PALIGNR", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i8>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; +def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; + +def X86VShld : SDNode<"X86ISD::VSHLD", SDTShuff3OpI>; +def X86VShrd : SDNode<"X86ISD::VSHRD", SDTShuff3OpI>; +def X86VShldv : SDNode<"X86ISD::VSHLDV", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>>; +def X86VShrdv : SDNode<"X86ISD::VSHRDV", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>>; + +def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>; + +def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; +def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; +def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>; + +def X86Shufp : SDNode<"X86ISD::SHUFP", SDTShuff3OpI>; +def X86Shuf128 : SDNode<"X86ISD::SHUF128", SDTShuff3OpI>; + +def X86Movddup : SDNode<"X86ISD::MOVDDUP", SDTShuff1Op>; +def X86Movshdup : SDNode<"X86ISD::MOVSHDUP", SDTShuff1Op>; +def X86Movsldup : SDNode<"X86ISD::MOVSLDUP", SDTShuff1Op>; + +def X86Movsd : SDNode<"X86ISD::MOVSD", SDTShuff2OpFP>; +def X86Movss : SDNode<"X86ISD::MOVSS", SDTShuff2OpFP>; + +def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTShuff2OpFP>; +def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2OpFP>; + +def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>, + SDTCisVec<1>, SDTCisInt<1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>, + SDTCisOpSmallerThanOp<0, 1>]>; +def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>; +def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; + +def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; +def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; + +def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; +def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i32>, + SDTCVecEltisVT<1, i16>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>, + [SDNPCommutative]>; + +def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; +def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; +def X86VPermv : SDNode<"X86ISD::VPERMV", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; +def X86VPermt2 : SDNode<"X86ISD::VPERMV3", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>, + SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>, + SDTCisSameSizeAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>; + +def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; + +def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImmRound>; +def X86VFixupimmScalar : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImmRound>; +def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImm>; +def X86VRangeRnd : SDNode<"X86ISD::VRANGE_RND", SDTFPBinOpImmRound>; +def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImm>; +def X86VReduceRnd : SDNode<"X86ISD::VREDUCE_RND", SDTFPUnaryOpImmRound>; +def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImm>; +def X86VRndScaleRnd: 
SDNode<"X86ISD::VRNDSCALE_RND", SDTFPUnaryOpImmRound>; +def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImm>; +def X86VGetMantRnd : SDNode<"X86ISD::VGETMANT_RND", SDTFPUnaryOpImmRound>; +def X86Vfpclass : SDNode<"X86ISD::VFPCLASS", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>, + SDTCisFP<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisVT<2, i32>]>, []>; +def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS", + SDTypeProfile<1, 2, [SDTCisVT<0, v1i1>, + SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>; + +def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSubVecOfVec<1, 0>]>, []>; + +def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; +def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; + +def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; + +def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>; + +def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; +def X86faddRnds : SDNode<"X86ISD::FADDS_RND", SDTFPBinOpRound>; +def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; +def X86fsubRnds : SDNode<"X86ISD::FSUBS_RND", SDTFPBinOpRound>; +def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; +def X86fmulRnds : SDNode<"X86ISD::FMULS_RND", SDTFPBinOpRound>; +def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; +def X86fdivRnds : SDNode<"X86ISD::FDIVS_RND", SDTFPBinOpRound>; +def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; +def X86fmaxRnds : SDNode<"X86ISD::FMAXS_RND", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; +def X86fminRnds : SDNode<"X86ISD::FMINS_RND", SDTFPBinOpRound>; +def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; +def X86scalefs : SDNode<"X86ISD::SCALEFS", SDTFPBinOpRound>; +def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; +def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>; +def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; +def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>; + +def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp, [SDNPCommutative]>; + +def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FmsubRnd : SDNode<"X86ISD::FMSUB_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>; + +def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; +def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>; +def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>; + +def X86rsqrt14 : SDNode<"X86ISD::RSQRT14", SDTFPUnaryOp>; +def X86rcp14 : SDNode<"X86ISD::RCP14", SDTFPUnaryOp>; + +// VNNI +def SDTVnni : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; +def X86Vpdpbusd : 
SDNode<"X86ISD::VPDPBUSD", SDTVnni>; +def X86Vpdpbusds : SDNode<"X86ISD::VPDPBUSDS", SDTVnni>; +def X86Vpdpwssd : SDNode<"X86ISD::VPDPWSSD", SDTVnni>; +def X86Vpdpwssds : SDNode<"X86ISD::VPDPWSSDS", SDTVnni>; + +def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>; +def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>; +def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>; + +def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>; +def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>; +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>; +def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>; +def X86Ranges : SDNode<"X86ISD::VRANGES", SDTFPBinOpImm>; +def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImm>; +def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImm>; +def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImm>; +def X86RangesRnd : SDNode<"X86ISD::VRANGES_RND", SDTFPBinOpImmRound>; +def X86RndScalesRnd : SDNode<"X86ISD::VRNDSCALES_RND", SDTFPBinOpImmRound>; +def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>; +def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>; + +def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1, + [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; +def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, + [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; + +// vpshufbitqmb +def X86Vpshufbitqmb : SDNode<"X86ISD::VPSHUFBITQMB", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>, + SDTCVecEltisVT<0,i1>, + SDTCisSameNumEltsAs<0,1>]>>; + +def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>, + SDTCisVT<3, i32>]>; + +def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisFP<1>]>; +def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisFP<1>, + SDTCisVT<2, i32>]>; +def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>, + SDTCisVec<1>, SDTCisVT<2, i32>]>; + +def SDTVintToFP: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisInt<1>]>; +def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisFP<0>, SDTCisInt<1>, + SDTCisVT<2, i32>]>; + +// Scalar +def X86SintToFpRnd : SDNode<"X86ISD::SCALAR_SINT_TO_FP_RND", SDTintToFPRound>; +def X86UintToFpRnd : SDNode<"X86ISD::SCALAR_UINT_TO_FP_RND", SDTintToFPRound>; + +def X86cvtts2IntRnd : SDNode<"X86ISD::CVTTS2SI_RND", SDTSFloatToIntRnd>; +def X86cvtts2UIntRnd : SDNode<"X86ISD::CVTTS2UI_RND", SDTSFloatToIntRnd>; + +def X86cvts2si : SDNode<"X86ISD::CVTS2SI_RND", SDTSFloatToIntRnd>; +def X86cvts2usi : SDNode<"X86ISD::CVTS2UI_RND", SDTSFloatToIntRnd>; + +// Vector with rounding mode + +// cvtt fp-to-int staff +def X86cvttp2siRnd : SDNode<"X86ISD::CVTTP2SI_RND", SDTFloatToIntRnd>; +def X86cvttp2uiRnd : SDNode<"X86ISD::CVTTP2UI_RND", SDTFloatToIntRnd>; + +def X86VSintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTVintToFPRound>; +def X86VUintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTVintToFPRound>; + +// cvt fp-to-int staff +def X86cvtp2IntRnd : SDNode<"X86ISD::CVTP2SI_RND", SDTFloatToIntRnd>; +def X86cvtp2UIntRnd : SDNode<"X86ISD::CVTP2UI_RND", SDTFloatToIntRnd>; + +// Vector without rounding mode + +// cvtt fp-to-int staff +def X86cvttp2si : SDNode<"X86ISD::CVTTP2SI", SDTFloatToInt>; +def X86cvttp2ui : SDNode<"X86ISD::CVTTP2UI", SDTFloatToInt>; + +def X86VSintToFP : SDNode<"X86ISD::CVTSI2P", SDTVintToFP>; +def X86VUintToFP : 
SDNode<"X86ISD::CVTUI2P", SDTVintToFP>; + +// cvt int-to-fp staff +def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>; +def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>; + + +def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>]> >; + +def X86cvtph2psRnd : SDNode<"X86ISD::CVTPH2PS_RND", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>, + SDTCisVT<2, i32>]> >; + +def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisVT<2, i32>]> >; +def X86vfpextRnd : SDNode<"X86ISD::VFPEXT_RND", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisVT<2, i32>]>>; +def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCisOpSmallerThanOp<0, 1>, + SDTCisVT<2, i32>]>>; + +// galois field arithmetic +def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>; +def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>; +def X86GF2P8mulb : SDNode<"X86ISD::GF2P8MULB", SDTIntBinOp>; + +//===----------------------------------------------------------------------===// +// SSE Complex Patterns +//===----------------------------------------------------------------------===// + +// These are 'extloads' from a scalar to the low element of a vector, zeroing +// the top elements. These are used for the SSE 'ss' and 'sd' instruction +// forms. +def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [], + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, + SDNPWantRoot, SDNPWantParent]>; +def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [], + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, + SDNPWantRoot, SDNPWantParent]>; + +def ssmem : Operand<v4f32> { + let PrintMethod = "printf32mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG); + let ParserMatchClass = X86Mem32AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} +def sdmem : Operand<v2f64> { + let PrintMethod = "printf64mem"; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG); + let ParserMatchClass = X86Mem64AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +//===----------------------------------------------------------------------===// +// SSE pattern fragments +//===----------------------------------------------------------------------===// + +// Vector load wrappers to prevent folding of non-temporal aligned loads on +// supporting targets. 
+def vecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return !useNonTemporalLoad(cast<LoadSDNode>(N)); +}]>; + +// 128-bit load pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 +def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (vecload node:$ptr))>; +def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (vecload node:$ptr))>; +def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (vecload node:$ptr))>; + +// 256-bit load pattern fragments +// NOTE: all 256-bit integer vector loads are promoted to v4i64 +def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (vecload node:$ptr))>; +def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (vecload node:$ptr))>; +def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (vecload node:$ptr))>; + +// 512-bit load pattern fragments +def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (vecload node:$ptr))>; +def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (vecload node:$ptr))>; +def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (vecload node:$ptr))>; + +// 128-/256-/512-bit extload pattern fragments +def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; +def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; +def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; + +// Like 'store', but always requires vector size alignment. +def alignedstore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + auto *St = cast<StoreSDNode>(N); + return St->getAlignment() >= St->getMemoryVT().getStoreSize(); +}]>; + +// Like 'load', but always requires 128-bit vector alignment. +def alignedvecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + auto *Ld = cast<LoadSDNode>(N); + return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize() && + !useNonTemporalLoad(cast<LoadSDNode>(N)); +}]>; + +// 128-bit aligned load pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 +def alignedloadv4f32 : PatFrag<(ops node:$ptr), + (v4f32 (alignedvecload node:$ptr))>; +def alignedloadv2f64 : PatFrag<(ops node:$ptr), + (v2f64 (alignedvecload node:$ptr))>; +def alignedloadv2i64 : PatFrag<(ops node:$ptr), + (v2i64 (alignedvecload node:$ptr))>; + +// 256-bit aligned load pattern fragments +// NOTE: all 256-bit integer vector loads are promoted to v4i64 +def alignedloadv8f32 : PatFrag<(ops node:$ptr), + (v8f32 (alignedvecload node:$ptr))>; +def alignedloadv4f64 : PatFrag<(ops node:$ptr), + (v4f64 (alignedvecload node:$ptr))>; +def alignedloadv4i64 : PatFrag<(ops node:$ptr), + (v4i64 (alignedvecload node:$ptr))>; + +// 512-bit aligned load pattern fragments +def alignedloadv16f32 : PatFrag<(ops node:$ptr), + (v16f32 (alignedvecload node:$ptr))>; +def alignedloadv8f64 : PatFrag<(ops node:$ptr), + (v8f64 (alignedvecload node:$ptr))>; +def alignedloadv8i64 : PatFrag<(ops node:$ptr), + (v8i64 (alignedvecload node:$ptr))>; + +// Like 'vecload', but uses special alignment checks suitable for use in +// memory operands in most SSE instructions, which are required to +// be naturally aligned on some targets but not on others. If the subtarget +// allows unaligned accesses, match any load, though this may require +// setting a feature bit in the processor (on startup, for example). +// Opteron 10h and later implement such a feature. 
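// Illustrative sketch (hypothetical, not needed by the real file): the typed
// fragments above (loadv4f32, alignedloadv2i64, ...) simply wrap the generic
// fragment in a concrete value type. Since 128-bit integer vector loads are
// promoted to v2i64 here, a byte-element variant never appears, but it would
// be spelled the same way:
def example_loadv16i8 : PatFrag<(ops node:$ptr), (v16i8 (vecload node:$ptr))>;
// The 'memop' fragment defined next adds the subtarget-dependent alignment
// check described in the comment above.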
+def memop : PatFrag<(ops node:$ptr), (vecload node:$ptr), [{ + auto *Ld = cast<LoadSDNode>(N); + return Subtarget->hasSSEUnalignedMem() || + Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize(); +}]>; + +// 128-bit memop pattern fragments +// NOTE: all 128-bit integer vector loads are promoted to v2i64 +def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; +def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; +def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; + +def X86masked_gather : SDNode<"X86ISD::MGATHER", + SDTypeProfile<2, 3, [SDTCisVec<0>, + SDTCisVec<1>, SDTCisInt<1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<1, 3>, + SDTCisPtrTy<4>]>, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def X86masked_scatter : SDNode<"X86ISD::MSCATTER", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<0, 2>, + SDTCVecEltisVT<0, i1>, + SDTCisPtrTy<3>]>, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); + return Mgt->getIndex().getValueType() == MVT::v4i32; +}]>; + +def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); + return Mgt->getIndex().getValueType() == MVT::v8i32; +}]>; + +def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); + return Mgt->getIndex().getValueType() == MVT::v2i64; +}]>; +def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); + return Mgt->getIndex().getValueType() == MVT::v4i64; +}]>; +def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); + return Mgt->getIndex().getValueType() == MVT::v8i64; +}]>; +def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); + return Mgt->getIndex().getValueType() == MVT::v16i32; +}]>; + +def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ + X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); + return Sc->getIndex().getValueType() == MVT::v2i64; +}]>; + +def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ + X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); + return Sc->getIndex().getValueType() == MVT::v4i32; +}]>; + +def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ + X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); + return Sc->getIndex().getValueType() == MVT::v4i64; +}]>; + +def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ + X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); + return Sc->getIndex().getValueType() == MVT::v8i32; +}]>; + +def 
mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ + X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); + return Sc->getIndex().getValueType() == MVT::v8i64; +}]>; +def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ + X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); + return Sc->getIndex().getValueType() == MVT::v16i32; +}]>; + +// 128-bit bitconvert pattern fragments +def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>; +def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>; +def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>; +def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>; +def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>; +def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>; + +// 256-bit bitconvert pattern fragments +def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>; +def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>; +def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>; +def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>; +def bc_v8f32 : PatFrag<(ops node:$in), (v8f32 (bitconvert node:$in))>; +def bc_v4f64 : PatFrag<(ops node:$in), (v4f64 (bitconvert node:$in))>; + +// 512-bit bitconvert pattern fragments +def bc_v64i8 : PatFrag<(ops node:$in), (v64i8 (bitconvert node:$in))>; +def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>; +def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>; +def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>; +def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>; + +def vzmovl_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzmovl + (v2i64 (scalar_to_vector (loadi64 node:$src))))))>; +def vzmovl_v4i32 : PatFrag<(ops node:$src), + (bitconvert (v4i32 (X86vzmovl + (v4i32 (scalar_to_vector (loadi32 node:$src))))))>; + +def vzload_v2i64 : PatFrag<(ops node:$src), + (bitconvert (v2i64 (X86vzload node:$src)))>; + + +def fp32imm0 : PatLeaf<(f32 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +def fp64imm0 : PatLeaf<(f64 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +def I8Imm : SDNodeXForm<imm, [{ + // Transformation function: get the low 8 bits. + return getI8Imm((uint8_t)N->getZExtValue(), SDLoc(N)); +}]>; + +def FROUND_NO_EXC : PatLeaf<(i32 8)>; +def FROUND_CURRENT : PatLeaf<(i32 4)>; + +// BYTE_imm - Transform bit immediates into byte immediates. +def BYTE_imm : SDNodeXForm<imm, [{ + // Transformation function: imm >> 3 + return getI32Imm(N->getZExtValue() >> 3, SDLoc(N)); +}]>; + +// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index +// to VEXTRACTF128/VEXTRACTI128 imm. +def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{ + return getExtractVEXTRACTImmediate(N, 128, SDLoc(N)); +}]>; + +// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to +// VINSERTF128/VINSERTI128 imm. +def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{ + return getInsertVINSERTImmediate(N, 128, SDLoc(N)); +}]>; + +// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index +// to VEXTRACTF64x4 imm. 
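// Illustrative sketch (hypothetical, not defined anywhere in this file): an
// SDNodeXForm is a small C++ rewrite applied to a matched operand while the
// output pattern is built. BYTE_imm above, for instance, shifts a bit offset
// right by 3, so a matched immediate of 24 is emitted as byte offset 3. A
// transform that doubled the matched immediate would look like this:
def example_DOUBLE_imm : SDNodeXForm<imm, [{
  // Transformation function: imm * 2, built with the same helper BYTE_imm uses.
  return getI32Imm(N->getZExtValue() * 2, SDLoc(N));
}]>;
// The xforms defined next perform the analogous index-to-immediate mapping
// for 256-bit subvector extracts and inserts.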
+def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{ + return getExtractVEXTRACTImmediate(N, 256, SDLoc(N)); +}]>; + +// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to +// VINSERTF64x4 imm. +def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{ + return getInsertVINSERTImmediate(N, 256, SDLoc(N)); +}]>; + +def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index), + (extract_subvector node:$bigvec, + node:$index), [{}], + EXTRACT_get_vextract128_imm>; + +def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, + node:$index), + (insert_subvector node:$bigvec, node:$smallvec, + node:$index), [{}], + INSERT_get_vinsert128_imm>; + +def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index), + (extract_subvector node:$bigvec, + node:$index), [{}], + EXTRACT_get_vextract256_imm>; + +def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, + node:$index), + (insert_subvector node:$bigvec, node:$smallvec, + node:$index), [{}], + INSERT_get_vinsert256_imm>; + +def X86mload : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() && + cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; + +def masked_load_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mload node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedLoadSDNode>(N)->getAlignment() >= 16; +}]>; + +def masked_load_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mload node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedLoadSDNode>(N)->getAlignment() >= 32; +}]>; + +def masked_load_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mload node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedLoadSDNode>(N)->getAlignment() >= 64; +}]>; + +def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() && + cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; +}]>; + +def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_load node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedLoadSDNode>(N)->isExpandingLoad(); +}]>; + +// Masked store fragments. +// X86mstore can't be implemented in core DAG files because some targets +// do not support vector types (llvm-tblgen will fail). 
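// Illustrative sketch (hypothetical, in the same style as the vi8/vi16/vi32
// fragments below): these fragments layer C++ predicates over the generic
// masked_store node so that instruction patterns can select one flavor
// (plain, compressing, truncating) and one element width. An i64-element
// truncating variant would be written as:
def example_masked_truncstorevi64 :
  PatFrag<(ops node:$src1, node:$src2, node:$src3),
          (masked_store node:$src1, node:$src2, node:$src3), [{
  MaskedStoreSDNode *St = cast<MaskedStoreSDNode>(N);
  return St->isTruncatingStore() &&
         St->getMemoryVT().getScalarType() == MVT::i64;
}]>;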
+def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) && + (!cast<MaskedStoreSDNode>(N)->isCompressingStore()); +}]>; + +def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getAlignment() >= 16; +}]>; + +def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getAlignment() >= 32; +}]>; + +def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getAlignment() >= 64; +}]>; + +def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) && + (!cast<MaskedStoreSDNode>(N)->isCompressingStore()); +}]>; + +def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->isCompressingStore(); +}]>; + +// masked truncstore fragments +// X86mtruncstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; +def masked_truncstorevi8 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def masked_truncstorevi16 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def masked_truncstorevi32 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +def X86TruncSStore : SDNode<"X86ISD::VTRUNCSTORES", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def X86TruncUSStore : SDNode<"X86ISD::VTRUNCSTOREUS", SDTStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTMaskedStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTMaskedStore, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr), + (X86TruncSStore node:$val, node:$ptr), [{ + return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; + +def truncstore_us_vi8 : PatFrag<(ops node:$val, node:$ptr), + (X86TruncUSStore node:$val, node:$ptr), [{ + return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; + +def truncstore_s_vi16 : PatFrag<(ops node:$val, node:$ptr), + (X86TruncSStore node:$val, node:$ptr), [{ + return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; + +def truncstore_us_vi16 : PatFrag<(ops node:$val, node:$ptr), + (X86TruncUSStore node:$val, node:$ptr), [{ + return 
cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; + +def truncstore_s_vi32 : PatFrag<(ops node:$val, node:$ptr), + (X86TruncSStore node:$val, node:$ptr), [{ + return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +def truncstore_us_vi32 : PatFrag<(ops node:$val, node:$ptr), + (X86TruncUSStore node:$val, node:$ptr), [{ + return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +def masked_truncstore_s_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; + +def masked_truncstore_us_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; + +def masked_truncstore_s_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; + +def masked_truncstore_us_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; + +def masked_truncstore_s_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86MTruncSStore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrInfo.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrInfo.td new file mode 100644 index 000000000..b43ea8ff2 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrInfo.td @@ -0,0 +1,3580 @@ +//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 instruction set, defining the instructions, and +// properties of the instructions which are needed for code generation, machine +// code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// X86 specific DAG Nodes. +// + +def SDTIntShiftDOp: SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisInt<3>]>; + +def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; + +def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + +def SDTX86Cmov : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; + +// Unary and binary operator instructions that set EFLAGS as a side-effect. 
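// Note on reading the profiles below: SDTypeProfile<NumResults, NumOperands,
// constraints> numbers results first and operands after them in a single
// sequence, so index 0 is the arithmetic result, index 1 the i32 EFLAGS
// result, and index 2 onward the inputs; SDTCisSameAs<0, 2> then ties the
// result type to the first input. A minimal restatement of the unary case
// (hypothetical name, for illustration only):
def example_SDTUnaryWithFlags : SDTypeProfile<2, 1,
  [SDTCisSameAs<0, 2>,  // value result matches the single operand
   SDTCisInt<0>,        // and is an integer type
   SDTCisVT<1, i32>]>;  // second result models EFLAGS as i32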
+def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, + [SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; +// RES1, RES2, FLAGS = op LHS, RHS +def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; +def SDTX86BrCond : SDTypeProfile<0, 3, + [SDTCisVT<0, OtherVT>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; + +def SDTX86SetCC : SDTypeProfile<1, 2, + [SDTCisVT<0, i8>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; +def SDTX86SetCC_C : SDTypeProfile<1, 2, + [SDTCisInt<0>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; + +def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>; + +def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, + SDTCisVT<2, i8>]>; +def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDTX86caspairSaveEbx8 : SDTypeProfile<1, 3, + [SDTCisVT<0, i32>, SDTCisPtrTy<1>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; +def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3, + [SDTCisVT<0, i64>, SDTCisPtrTy<1>, + SDTCisVT<2, i64>, SDTCisVT<3, i64>]>; + +def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisPtrTy<1>, + SDTCisInt<2>]>; + +def SDTLockUnaryArithWithFlags : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, + SDTCisPtrTy<1>]>; + +def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>; + +def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; +def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; + +def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; + +def SDT_X86NtBrind : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; + +def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, + SDTCisVT<1, iPTR>, + SDTCisVT<2, iPTR>]>; + +def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, i32>, + SDTCisVT<3, i8>, + SDTCisVT<4, i32>]>; + +def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>; + +def SDTX86Void : SDTypeProfile<0, 0, []>; + +def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; + +def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; + +def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; + +def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; + +def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; + +def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, + [SDNPHasChain,SDNPSideEffect]>; +def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; + + +def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; +def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; +def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; +def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>; + +def X86cmp : SDNode<"X86ISD::CMP" , 
SDTX86CmpTest>; +def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>; + +def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; +def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, + [SDNPHasChain]>; +def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>; +def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>; + +def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>; + +def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + +def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + +def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas8save_ebx : SDNode<"X86ISD::LCMPXCHG8_SAVE_EBX_DAG", + SDTX86caspairSaveEbx8, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG", + SDTX86caspairSaveRbx16, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; + +def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue]>; + +def X86vastart_save_xmm_regs : + SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", + SDT_X86VASTART_SAVE_XMM_REGS, + [SDNPHasChain, SDNPVariadic]>; +def X86vaarg64 : + SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, + SDNPMemOperand]>; +def X86callseq_start : + SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def X86callseq_end : + SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86call : SDNode<"X86ISD::CALL", SDT_X86Call, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, + SDNPVariadic]>; + +def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, + SDNPVariadic]>; +def X86NoTrackBrind : SDNode<"X86ISD::NT_BRIND", SDT_X86NtBrind, + [SDNPHasChain]>; + +def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>; +def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad]>; + +def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; + +def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; +def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; + +def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER", + SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisInt<1>]>>; + +def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET, + [SDNPHasChain]>; + +def X86eh_sjlj_setjmp : 
SDNode<"X86ISD::EH_SJLJ_SETJMP", + SDTypeProfile<1, 1, [SDTCisInt<0>, + SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect]>; +def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPSideEffect]>; +def X86eh_sjlj_setup_dispatch : SDNode<"X86ISD::EH_SJLJ_SETUP_DISPATCH", + SDTypeProfile<0, 0, []>, + [SDNPHasChain, SDNPSideEffect]>; + +def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>; +def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags, + [SDNPCommutative]>; +def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>; +def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>; + +def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>; +def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>; +def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, + [SDNPCommutative]>; + +def X86lock_add : SDNode<"X86ISD::LADD", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_sub : SDNode<"X86ISD::LSUB", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_or : SDNode<"X86ISD::LOR", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_xor : SDNode<"X86ISD::LXOR", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; + +def X86lock_inc : SDNode<"X86ISD::LINC", SDTLockUnaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_dec : SDNode<"X86ISD::LDEC", SDTLockUnaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; + +def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>; + +def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; + +def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA, + [SDNPHasChain, SDNPOutGlue]>; + +def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, + [SDNPHasChain]>; + +def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86lwpins : SDNode<"X86ISD::LWPINS", + SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPSideEffect]>; + +def X86umwait : SDNode<"X86ISD::UMWAIT", + SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, + [SDNPHasChain, SDNPSideEffect]>; + +def X86tpause : SDNode<"X86ISD::TPAUSE", + SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, + [SDNPHasChain, SDNPSideEffect]>; + +//===----------------------------------------------------------------------===// +// X86 Operand Definitions. +// + +// A version of ptr_rc which excludes SP, ESP, and RSP. 
This is used for +// the index operand of an address, to conform to x86 encoding restrictions. +def ptr_rc_nosp : PointerLikeRegClass<1>; + +// *mem - Operand definitions for the funky X86 addressing mode operands. +// +def X86MemAsmOperand : AsmOperandClass { + let Name = "Mem"; +} +let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in { + def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; } + def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; } + def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; } + def X86Mem64AsmOperand : AsmOperandClass { let Name = "Mem64"; } + def X86Mem80AsmOperand : AsmOperandClass { let Name = "Mem80"; } + def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; } + def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; } + def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; } + // Gather mem operands + def X86Mem64_RC128Operand : AsmOperandClass { let Name = "Mem64_RC128"; } + def X86Mem128_RC128Operand : AsmOperandClass { let Name = "Mem128_RC128"; } + def X86Mem256_RC128Operand : AsmOperandClass { let Name = "Mem256_RC128"; } + def X86Mem128_RC256Operand : AsmOperandClass { let Name = "Mem128_RC256"; } + def X86Mem256_RC256Operand : AsmOperandClass { let Name = "Mem256_RC256"; } + + def X86Mem64_RC128XOperand : AsmOperandClass { let Name = "Mem64_RC128X"; } + def X86Mem128_RC128XOperand : AsmOperandClass { let Name = "Mem128_RC128X"; } + def X86Mem256_RC128XOperand : AsmOperandClass { let Name = "Mem256_RC128X"; } + def X86Mem128_RC256XOperand : AsmOperandClass { let Name = "Mem128_RC256X"; } + def X86Mem256_RC256XOperand : AsmOperandClass { let Name = "Mem256_RC256X"; } + def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; } + def X86Mem256_RC512Operand : AsmOperandClass { let Name = "Mem256_RC512"; } + def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; } +} + +def X86AbsMemAsmOperand : AsmOperandClass { + let Name = "AbsMem"; + let SuperClasses = [X86MemAsmOperand]; +} + +class X86MemOperand<string printMethod, + AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> { + let PrintMethod = printMethod; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG); + let ParserMatchClass = parserMatchClass; + let OperandType = "OPERAND_MEMORY"; +} + +// Gather mem operands +class X86VMemOperand<RegisterClass RC, string printMethod, + AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG); +} + +def anymem : X86MemOperand<"printanymem">; + +// FIXME: Right now we allow any size during parsing, but we might want to +// restrict to only unsized memory. 
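// Note on the operands below: every memory operand built from X86MemOperand
// expands to the same five machine operands (base register, scale, index
// register, displacement, segment register), i.e. the x86 address form
// segment:[base + scale*index + disp]. Declaring a new sized operand is a
// one-liner over that class; the following name is hypothetical and only
// illustrates the shape:
def example_mem16 : X86MemOperand<"printi16mem", X86Mem16AsmOperand>;
// The unsized 'opaquemem' operand that the FIXME above refers to follows.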
+def opaquemem : X86MemOperand<"printopaquemem">; + +def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>; +def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>; +def i32mem : X86MemOperand<"printi32mem", X86Mem32AsmOperand>; +def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>; +def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>; +def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>; +def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>; +def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>; +def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>; +def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>; +def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>; +def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>; +def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>; + +def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>; + +// Gather mem operands +def vx64mem : X86VMemOperand<VR128, "printi64mem", X86Mem64_RC128Operand>; +def vx128mem : X86VMemOperand<VR128, "printi128mem", X86Mem128_RC128Operand>; +def vx256mem : X86VMemOperand<VR128, "printi256mem", X86Mem256_RC128Operand>; +def vy128mem : X86VMemOperand<VR256, "printi128mem", X86Mem128_RC256Operand>; +def vy256mem : X86VMemOperand<VR256, "printi256mem", X86Mem256_RC256Operand>; + +def vx64xmem : X86VMemOperand<VR128X, "printi64mem", X86Mem64_RC128XOperand>; +def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>; +def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>; +def vy128xmem : X86VMemOperand<VR256X, "printi128mem", X86Mem128_RC256XOperand>; +def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>; +def vy512xmem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>; +def vz256mem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>; +def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>; + +// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead +// of a plain GPR, so that it doesn't potentially require a REX prefix. +def ptr_rc_norex : PointerLikeRegClass<2>; +def ptr_rc_norex_nosp : PointerLikeRegClass<3>; + +def i8mem_NOREX : Operand<iPTR> { + let PrintMethod = "printi8mem"; + let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, + SEGMENT_REG); + let ParserMatchClass = X86Mem8AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +// GPRs available for tailcall. +// It represents GR32_TC, GR64_TC or GR64_TCW64. +def ptr_rc_tailcall : PointerLikeRegClass<4>; + +// Special i32mem for addresses of load folding tail calls. These are not +// allowed to use callee-saved registers since they must be scheduled +// after callee-saved register are popped. +def i32mem_TC : Operand<i32> { + let PrintMethod = "printi32mem"; + let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, + i32imm, SEGMENT_REG); + let ParserMatchClass = X86Mem32AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +// Special i64mem for addresses of load folding tail calls. These are not +// allowed to use callee-saved registers since they must be scheduled +// after callee-saved register are popped. 
+def i64mem_TC : Operand<i64> { + let PrintMethod = "printi64mem"; + let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, + ptr_rc_tailcall, i32imm, SEGMENT_REG); + let ParserMatchClass = X86Mem64AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +let OperandType = "OPERAND_PCREL", + ParserMatchClass = X86AbsMemAsmOperand, + PrintMethod = "printPCRelImm" in { +def i32imm_pcrel : Operand<i32>; +def i16imm_pcrel : Operand<i16>; + +// Branch targets have OtherVT type and print as pc-relative values. +def brtarget : Operand<OtherVT>; +def brtarget8 : Operand<OtherVT>; + +} + +// Special parser to detect 16-bit mode to select 16-bit displacement. +def X86AbsMem16AsmOperand : AsmOperandClass { + let Name = "AbsMem16"; + let RenderMethod = "addAbsMemOperands"; + let SuperClasses = [X86AbsMemAsmOperand]; +} + +// Branch targets have OtherVT type and print as pc-relative values. +let OperandType = "OPERAND_PCREL", + PrintMethod = "printPCRelImm" in { +let ParserMatchClass = X86AbsMem16AsmOperand in + def brtarget16 : Operand<OtherVT>; +let ParserMatchClass = X86AbsMemAsmOperand in + def brtarget32 : Operand<OtherVT>; +} + +let RenderMethod = "addSrcIdxOperands" in { + def X86SrcIdx8Operand : AsmOperandClass { + let Name = "SrcIdx8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86SrcIdx16Operand : AsmOperandClass { + let Name = "SrcIdx16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86SrcIdx32Operand : AsmOperandClass { + let Name = "SrcIdx32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86SrcIdx64Operand : AsmOperandClass { + let Name = "SrcIdx64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addSrcIdxOperands" + +let RenderMethod = "addDstIdxOperands" in { + def X86DstIdx8Operand : AsmOperandClass { + let Name = "DstIdx8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86DstIdx16Operand : AsmOperandClass { + let Name = "DstIdx16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86DstIdx32Operand : AsmOperandClass { + let Name = "DstIdx32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86DstIdx64Operand : AsmOperandClass { + let Name = "DstIdx64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addDstIdxOperands" + +let RenderMethod = "addMemOffsOperands" in { + def X86MemOffs16_8AsmOperand : AsmOperandClass { + let Name = "MemOffs16_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs16_16AsmOperand : AsmOperandClass { + let Name = "MemOffs16_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs16_32AsmOperand : AsmOperandClass { + let Name = "MemOffs16_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs32_8AsmOperand : AsmOperandClass { + let Name = "MemOffs32_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs32_16AsmOperand : AsmOperandClass { + let Name = "MemOffs32_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs32_32AsmOperand : AsmOperandClass { + let Name = "MemOffs32_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs32_64AsmOperand : AsmOperandClass { + let Name = "MemOffs32_64"; + let SuperClasses = [X86Mem64AsmOperand]; + } + def X86MemOffs64_8AsmOperand : AsmOperandClass { + let Name = "MemOffs64_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs64_16AsmOperand : AsmOperandClass { + let Name = "MemOffs64_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs64_32AsmOperand : AsmOperandClass { + let Name = "MemOffs64_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } 
+ def X86MemOffs64_64AsmOperand : AsmOperandClass { + let Name = "MemOffs64_64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addMemOffsOperands" + +class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc, SEGMENT_REG); +} + +class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc); +} + +def srcidx8 : X86SrcIdxOperand<"printSrcIdx8", X86SrcIdx8Operand>; +def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", X86SrcIdx16Operand>; +def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>; +def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>; +def dstidx8 : X86DstIdxOperand<"printDstIdx8", X86DstIdx8Operand>; +def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>; +def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>; +def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>; + +class X86MemOffsOperand<Operand immOperand, string printMethod, + AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops immOperand, SEGMENT_REG); +} + +def offset16_8 : X86MemOffsOperand<i16imm, "printMemOffs8", + X86MemOffs16_8AsmOperand>; +def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16", + X86MemOffs16_16AsmOperand>; +def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32", + X86MemOffs16_32AsmOperand>; +def offset32_8 : X86MemOffsOperand<i32imm, "printMemOffs8", + X86MemOffs32_8AsmOperand>; +def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16", + X86MemOffs32_16AsmOperand>; +def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32", + X86MemOffs32_32AsmOperand>; +def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64", + X86MemOffs32_64AsmOperand>; +def offset64_8 : X86MemOffsOperand<i64imm, "printMemOffs8", + X86MemOffs64_8AsmOperand>; +def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16", + X86MemOffs64_16AsmOperand>; +def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32", + X86MemOffs64_32AsmOperand>; +def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64", + X86MemOffs64_64AsmOperand>; + +def SSECC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def AVXCC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def AVX512ICC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def XOPCC : Operand<i8> { + let PrintMethod = "printXOPCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +class ImmSExtAsmOperandClass : AsmOperandClass { + let SuperClasses = [ImmAsmOperand]; + let RenderMethod = "addImmOperands"; +} + +def X86GR32orGR64AsmOperand : AsmOperandClass { + let Name = "GR32orGR64"; +} + +def GR32orGR64 : RegisterOperand<GR32> { + let ParserMatchClass = X86GR32orGR64AsmOperand; +} +def AVX512RCOperand : AsmOperandClass { + let Name = "AVX512RC"; +} +def AVX512RC : Operand<i32> { + let PrintMethod = "printRoundingControl"; + let OperandType = "OPERAND_IMMEDIATE"; + let ParserMatchClass = AVX512RCOperand; +} + +// Sign-extended immediate classes. We don't need to define the full lattice +// here because there is no instruction with an ambiguity between ImmSExti64i32 +// and ImmSExti32i8. 
+// +// The strange ranges come from the fact that the assembler always works with +// 64-bit immediates, but for a 16-bit target value we want to accept both "-1" +// (which will be a -1ULL), and "0xFF" (-1 in 16-bits). + +// [0, 0x7FFFFFFF] | +// [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF] +def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti64i32"; +} + +// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti16i8"; + let SuperClasses = [ImmSExti64i32AsmOperand]; +} + +// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti32i8"; +} + +// [0, 0x0000007F] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti64i8"; + let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand, + ImmSExti64i32AsmOperand]; +} + +// Unsigned immediate used by SSE/AVX instructions +// [0, 0xFF] +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmUnsignedi8AsmOperand : AsmOperandClass { + let Name = "ImmUnsignedi8"; + let RenderMethod = "addImmOperands"; +} + +// A couple of more descriptive operand definitions. +// 16-bits but only 8 bits are significant. +def i16i8imm : Operand<i16> { + let ParserMatchClass = ImmSExti16i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} +// 32-bits but only 8 bits are significant. +def i32i8imm : Operand<i32> { + let ParserMatchClass = ImmSExti32i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 32 bits are significant. +def i64i32imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i32AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 8 bits are significant. +def i64i8imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// Unsigned 8-bit immediate used by SSE/AVX instructions. +def u8imm : Operand<i8> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 32-bit immediate but only 8-bits are significant and they are unsigned. +// Used by some SSE/AVX instructions that use intrinsics. +def i32u8imm : Operand<i32> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 32 bits are significant, and those bits are treated as being +// pc relative. +def i64i32imm_pcrel : Operand<i64> { + let PrintMethod = "printPCRelImm"; + let ParserMatchClass = X86AbsMemAsmOperand; + let OperandType = "OPERAND_PCREL"; +} + +def lea64_32mem : Operand<i32> { + let PrintMethod = "printanymem"; + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG); + let ParserMatchClass = X86MemAsmOperand; +} + +// Memory operands that use 64-bit pointers in both ILP32 and LP64. +def lea64mem : Operand<i64> { + let PrintMethod = "printanymem"; + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG); + let ParserMatchClass = X86MemAsmOperand; +} + + +//===----------------------------------------------------------------------===// +// X86 Complex Pattern Definitions. +// + +// Define X86-specific addressing mode. 
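// Note on the patterns below: a ComplexPattern delegates matching to the C++
// selector named by its string (here "selectAddr", "selectLEAAddr", ...);
// the '5' says the selector produces the usual five address operands (base,
// scale, index, displacement, segment), and the node list restricts which
// roots it may anchor on. A hypothetical variant reusing the same selector
// but limited to add/frameindex roots would be declared as:
def example_addr_narrow : ComplexPattern<iPTR, 5, "selectAddr",
                                         [add, frameindex], []>;
// In instruction patterns these appear as e.g. 'addr:$src', which then
// expands into all five operands of the selected machine instruction.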
+def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>; +def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, frameindex], + []>; +// In 64-bit mode 32-bit LEAs can use RIP-relative addressing. +def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr", + [add, sub, mul, X86mul_imm, shl, or, + frameindex, X86WrapperRIP], + []>; + +def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, frameindex, + X86WrapperRIP], []>; + +def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>; + +// A relocatable immediate is either an immediate operand or an operand that can +// be relocated by the linker to an immediate, such as a regular symbol in +// non-PIC code. +def relocImm : ComplexPattern<iAny, 1, "selectRelocImm", [imm, X86Wrapper], [], + 0>; + +//===----------------------------------------------------------------------===// +// X86 Instruction Predicate Definitions. +def TruePredicate : Predicate<"true">; + +def HasCMov : Predicate<"Subtarget->hasCMov()">; +def NoCMov : Predicate<"!Subtarget->hasCMov()">; + +def HasMMX : Predicate<"Subtarget->hasMMX()">; +def Has3DNow : Predicate<"Subtarget->has3DNow()">; +def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; +def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; +def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">; +def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; +def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">; +def HasSSE3 : Predicate<"Subtarget->hasSSE3()">; +def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">; +def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; +def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">; +def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; +def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">; +def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">; +def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; +def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">; +def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; +def NoAVX : Predicate<"!Subtarget->hasAVX()">; +def HasAVX : Predicate<"Subtarget->hasAVX()">; +def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; +def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; +def HasAVX512 : Predicate<"Subtarget->hasAVX512()">; +def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; +def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; +def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; +def HasCDI : Predicate<"Subtarget->hasCDI()">; +def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">; +def HasPFI : Predicate<"Subtarget->hasPFI()">; +def HasERI : Predicate<"Subtarget->hasERI()">; +def HasDQI : Predicate<"Subtarget->hasDQI()">; +def NoDQI : Predicate<"!Subtarget->hasDQI()">; +def HasBWI : Predicate<"Subtarget->hasBWI()">; +def NoBWI : Predicate<"!Subtarget->hasBWI()">; +def HasVLX : Predicate<"Subtarget->hasVLX()">; +def NoVLX : Predicate<"!Subtarget->hasVLX()">; +def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() 
|| !Subtarget->hasBWI()">; +def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; +def PKU : Predicate<"Subtarget->hasPKU()">; +def HasVNNI : Predicate<"Subtarget->hasVNNI()">; + +def HasBITALG : Predicate<"Subtarget->hasBITALG()">; +def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; +def HasAES : Predicate<"Subtarget->hasAES()">; +def HasVAES : Predicate<"Subtarget->hasVAES()">; +def NoVLX_Or_NoVAES : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVAES()">; +def HasFXSR : Predicate<"Subtarget->hasFXSR()">; +def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">; +def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">; +def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">; +def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">; +def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; +def NoVLX_Or_NoVPCLMULQDQ : + Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVPCLMULQDQ()">; +def HasVPCLMULQDQ : Predicate<"Subtarget->hasVPCLMULQDQ()">; +def HasGFNI : Predicate<"Subtarget->hasGFNI()">; +def HasFMA : Predicate<"Subtarget->hasFMA()">; +def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; +def NoFMA4 : Predicate<"!Subtarget->hasFMA4()">; +def HasXOP : Predicate<"Subtarget->hasXOP()">; +def HasTBM : Predicate<"Subtarget->hasTBM()">; +def NoTBM : Predicate<"!Subtarget->hasTBM()">; +def HasLWP : Predicate<"Subtarget->hasLWP()">; +def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; +def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; +def HasF16C : Predicate<"Subtarget->hasF16C()">; +def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">; +def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; +def HasBMI : Predicate<"Subtarget->hasBMI()">; +def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; +def NoBMI2 : Predicate<"!Subtarget->hasBMI2()">; +def HasVBMI : Predicate<"Subtarget->hasVBMI()">; +def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">; +def HasIFMA : Predicate<"Subtarget->hasIFMA()">; +def HasRTM : Predicate<"Subtarget->hasRTM()">; +def HasADX : Predicate<"Subtarget->hasADX()">; +def HasSHA : Predicate<"Subtarget->hasSHA()">; +def HasSGX : Predicate<"Subtarget->hasSGX()">; +def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; +def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; +def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">; +def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">; +def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">; +def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; +def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">; +def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">; +def HasCLDEMOTE : Predicate<"Subtarget->hasCLDEMOTE()">; +def HasMOVDIRI : Predicate<"Subtarget->hasMOVDIRI()">; +def HasMOVDIR64B : Predicate<"Subtarget->hasMOVDIR64B()">; +def HasPTWRITE : Predicate<"Subtarget->hasPTWRITE()">; +def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; +def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; +def HasMPX : Predicate<"Subtarget->hasMPX()">; +def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">; +def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">; +def HasCLWB : Predicate<"Subtarget->hasCLWB()">; +def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">; +def HasRDPID : Predicate<"Subtarget->hasRDPID()">; +def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">; +def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">; +def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; +def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">; +def Not64BitMode : 
Predicate<"!Subtarget->is64Bit()">, + AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">; +def In64BitMode : Predicate<"Subtarget->is64Bit()">, + AssemblerPredicate<"Mode64Bit", "64-bit mode">; +def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">; +def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">; +def In16BitMode : Predicate<"Subtarget->is16Bit()">, + AssemblerPredicate<"Mode16Bit", "16-bit mode">; +def Not16BitMode : Predicate<"!Subtarget->is16Bit()">, + AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">; +def In32BitMode : Predicate<"Subtarget->is32Bit()">, + AssemblerPredicate<"Mode32Bit", "32-bit mode">; +def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; +def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; +def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" + "Subtarget->getFrameLowering()->hasFP(*MF)"> { + let RecomputePerFunction = 1; +} +def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; +def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; +def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; +def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; +def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; +def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">; +def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" + "TM.getCodeModel() == CodeModel::Kernel">; +def IsNotPIC : Predicate<"!TM.isPositionIndependent()">; + +// We could compute these on a per-module basis but doing so requires accessing +// the Function object through the <Target>Subtarget and objections were raised +// to that (see post-commit review comments for r301750). +let RecomputePerFunction = 1 in { + def OptForSize : Predicate<"MF->getFunction().optForSize()">; + def OptForMinSize : Predicate<"MF->getFunction().optForMinSize()">; + def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">; + def UseIncDec : Predicate<"!Subtarget->slowIncDec() || " + "MF->getFunction().optForSize()">; + def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().optForSize() || " + "!Subtarget->hasSSE41()">; +} + +def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">; +def FavorMemIndirectCall : Predicate<"!Subtarget->slowTwoMemOps()">; +def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; +def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; +def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; +def HasERMSB : Predicate<"Subtarget->hasERMSB()">; +def HasMFence : Predicate<"Subtarget->hasMFence()">; +def UseRetpoline : Predicate<"Subtarget->useRetpoline()">; +def NotUseRetpoline : Predicate<"!Subtarget->useRetpoline()">; + +//===----------------------------------------------------------------------===// +// X86 Instruction Format Definitions. +// + +include "X86InstrFormats.td" + +//===----------------------------------------------------------------------===// +// Pattern fragments. +// + +// X86 specific condition code. These correspond to CondCode in +// X86InstrInfo.h. They must be kept in synch. +def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE +def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC +def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C +def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA +def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z +def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE +def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL +def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE +def X86_COND_LE : PatLeaf<(i8 8)>; // alt. 
COND_NG +def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ +def X86_COND_NO : PatLeaf<(i8 10)>; +def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO +def X86_COND_NS : PatLeaf<(i8 12)>; +def X86_COND_O : PatLeaf<(i8 13)>; +def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE +def X86_COND_S : PatLeaf<(i8 15)>; + +def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>; +def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; +def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>; +def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>; + +// FIXME: Ideally we would just replace the above i*immSExt* matchers with +// relocImm-based matchers, but then FastISel would be unable to use them. +def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{ + return isSExtRelocImm<8>(N); +}]>; +def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{ + return isSExtRelocImm<32>(N); +}]>; + +// If we have multiple users of an immediate, it's much smaller to reuse +// the register, rather than encode the immediate in every instruction. +// This has the risk of increasing register pressure from stretched live +// ranges, however, the immediates should be trivial to rematerialize by +// the RA in the event of high register pressure. +// TODO : This is currently enabled for stores and binary ops. There are more +// cases for which this can be enabled, though this catches the bulk of the +// issues. +// TODO2 : This should really also be enabled under O2, but there's currently +// an issue with RA where we don't pull the constants into their users +// when we rematerialize them. I'll follow-up on enabling O2 after we fix that +// issue. +// TODO3 : This is currently limited to single basic blocks (DAG creation +// pulls block immediates to the top and merges them if necessary). +// Eventually, it would be nice to allow ConstantHoisting to merge constants +// globally for potentially added savings. +// +def imm8_su : PatLeaf<(i8 relocImm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm16_su : PatLeaf<(i16 relocImm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm32_su : PatLeaf<(i32 relocImm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i64immSExt32_su : PatLeaf<(i64immSExt32), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + +def i16immSExt8_su : PatLeaf<(i16immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i32immSExt8_su : PatLeaf<(i32immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i64immSExt8_su : PatLeaf<(i64immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + +def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + +// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit +// unsigned field. +def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>; + +def i64immZExt32SExt8 : ImmLeaf<i64, [{ + return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm)); +}]>; + +// Helper fragments for loads. + +// It's safe to fold a zextload/extload from i1 as a regular i8 load. The +// upper bits are guaranteed to be zero and we were going to emit a MOV8rm +// which might get folded during peephole anyway. 
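+// Concretely, the loadi8 fragment below accepts NON_EXTLOAD, EXTLOAD and
+// ZEXTLOAD nodes and rejects only SEXTLOAD, where the upper bits would have to
+// be a sign copy rather than zero.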
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  return ExtType == ISD::NON_EXTLOAD || ExtType == ISD::EXTLOAD ||
+         ExtType == ISD::ZEXTLOAD;
+}]>;
+
+// It's always safe to treat an anyext i16 load as an i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::NON_EXTLOAD)
+    return true;
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 2 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::NON_EXTLOAD)
+    return true;
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 4 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
+def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>;
+def alignedloadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr)), [{
+  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
+}]>;
+def memopf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr)), [{
+  LoadSDNode *Ld = cast<LoadSDNode>(N);
+  return Subtarget->hasSSEUnalignedMem() ||
+         Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
+}]>;
+
+def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>;
+def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>;
+def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+def extloadi64i1 : PatFrag<(ops node:$ptr), (i64
(extloadi1 node:$ptr))>; +def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>; +def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>; +def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>; + + +// An 'and' node with a single use. +def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; +// An 'srl' node with a single use. +def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; +// An 'trunc' node with a single use. +def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ + return N->hasOneUse(); +}]>; + +//===----------------------------------------------------------------------===// +// Instruction list. +// + +// Nop +let hasSideEffects = 0, SchedRW = [WriteNop] in { + def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>; + def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16, NotMemoryFoldable; + def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32, NotMemoryFoldable; + def NOOPQ : RI<0x1f, MRMXm, (outs), (ins i64mem:$zero), + "nop{q}\t$zero", []>, TB, NotMemoryFoldable, + Requires<[In64BitMode]>; + // Also allow register so we can assemble/disassemble + def NOOPWr : I<0x1f, MRMXr, (outs), (ins GR16:$zero), + "nop{w}\t$zero", []>, TB, OpSize16, NotMemoryFoldable; + def NOOPLr : I<0x1f, MRMXr, (outs), (ins GR32:$zero), + "nop{l}\t$zero", []>, TB, OpSize32, NotMemoryFoldable; + def NOOPQr : RI<0x1f, MRMXr, (outs), (ins GR64:$zero), + "nop{q}\t$zero", []>, TB, NotMemoryFoldable, + Requires<[In64BitMode]>; + def NOOPW_19 : I<0x19, MRMXm, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOPL_19 : I<0x19, MRMXm, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + //def NOOPW_1a : I<0x1a, MRMXm, (outs), (ins i16mem:$zero), + // "nop{w}\t$zero", []>, TB, OpSize16; + //def NOOPL_1a : I<0x1a, MRMXm, (outs), (ins i32mem:$zero), + // "nop{l}\t$zero", []>, TB, OpSize32; + //def NOOPW_1b : I<0x1b, MRMXm, (outs), (ins i16mem:$zero), + // "nop{w}\t$zero", []>, TB, OpSize16; + //def NOOPL_1b : I<0x1b, MRMXm, (outs), (ins i32mem:$zero), + // "nop{l}\t$zero", []>, TB, OpSize32; + def NOOPW_1c : I<0x1c, MRMXm, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + //def NOOPL_1c : I<0x1c, MRMXm, (outs), (ins i32mem:$zero), + // "nop{l}\t$zero", []>, TB, OpSize32; + def NOOPW_1d : I<0x1d, MRMXm, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOPL_1d : I<0x1d, MRMXm, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + def NOOPW_1e : I<0x1e, MRMXm, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOPL_1e : I<0x1e, MRMXm, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16m4 : I<0x18, MRM4m, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_m4 : I<0x18, MRM4m, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16r4 : I<0x18, MRM4r, (outs), (ins GR16:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_r4 : I<0x18, MRM4r, (outs), (ins GR32:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16m5 : I<0x18, MRM5m, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_m5 : I<0x18, MRM5m, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16r5 : I<0x18, MRM5r, (outs), (ins GR16:$zero), 
+ "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_r5 : I<0x18, MRM5r, (outs), (ins GR32:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16m6 : I<0x18, MRM6m, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_m6 : I<0x18, MRM6m, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16r6 : I<0x18, MRM6r, (outs), (ins GR16:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_r6 : I<0x18, MRM6r, (outs), (ins GR32:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16m7 : I<0x18, MRM7m, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_m7 : I<0x18, MRM7m, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16r7 : I<0x18, MRM7r, (outs), (ins GR16:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_r7 : I<0x18, MRM7r, (outs), (ins GR32:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; +} + + +// Constructing a stack frame. +def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl), + "enter\t$len, $lvl", []>, Sched<[WriteMicrocoded]>; + +let SchedRW = [WriteALU] in { +let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in +def LEAVE : I<0xC9, RawFrm, (outs), (ins), "leave", []>, + Requires<[Not64BitMode]>; + +let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in +def LEAVE64 : I<0xC9, RawFrm, (outs), (ins), "leave", []>, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +/* +let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1, + SchedRW = [WriteSystem] in + def Int_eh_sjlj_setup_dispatch + : PseudoI<(outs), (ins), [(X86eh_sjlj_setup_dispatch)]>; +*/ + +let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in { +let mayLoad = 1, SchedRW = [WriteLoad] in { +def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>, + OpSize16; +def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>, + OpSize32, Requires<[Not64BitMode]>; +// Long form for the disassembler. +let isCodeGenOnly = 1, ForceDisassemble = 1 in { +def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>, + OpSize16, NotMemoryFoldable; +def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>, + OpSize32, Requires<[Not64BitMode]>, NotMemoryFoldable; +} // isCodeGenOnly = 1, ForceDisassemble = 1 +} // mayLoad, SchedRW +let mayStore = 1, mayLoad = 1, SchedRW = [WriteRMW] in { +def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", []>, + OpSize16; +def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", []>, + OpSize32, Requires<[Not64BitMode]>; +} // mayStore, mayLoad, WriteRMW + +let mayStore = 1, SchedRW = [WriteStore] in { +def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>, + OpSize16; +def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>, + OpSize32, Requires<[Not64BitMode]>; +// Long form for the disassembler. 
+let isCodeGenOnly = 1, ForceDisassemble = 1 in { +def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>, + OpSize16, NotMemoryFoldable; +def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>, + OpSize32, Requires<[Not64BitMode]>, NotMemoryFoldable; +} // isCodeGenOnly = 1, ForceDisassemble = 1 + +def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm), + "push{w}\t$imm", []>, OpSize16; +def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), + "push{w}\t$imm", []>, OpSize16; + +def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm), + "push{l}\t$imm", []>, OpSize32, + Requires<[Not64BitMode]>; +def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), + "push{l}\t$imm", []>, OpSize32, + Requires<[Not64BitMode]>; +} // mayStore, SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { +def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src", []>, + OpSize16; +def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src", []>, + OpSize32, Requires<[Not64BitMode]>; +} // mayLoad, mayStore, SchedRW + +} + +/* +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW], Defs = [ESP] in { + let Uses = [ESP] in + def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_flags_read_u32))]>, + Requires<[Not64BitMode]>; + + let Uses = [RSP] in + def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins), + [(set GR64:$dst, (int_x86_flags_read_u64))]>, + Requires<[In64BitMode]>; +} + +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW] in { + let Defs = [ESP, EFLAGS, DF], Uses = [ESP] in + def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src), + [(int_x86_flags_write_u32 GR32:$src)]>, + Requires<[Not64BitMode]>; + + let Defs = [RSP, EFLAGS, DF], Uses = [RSP] in + def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src), + [(int_x86_flags_write_u64 GR64:$src)]>, + Requires<[In64BitMode]>; +} +*/ + +let Defs = [ESP, EFLAGS, DF], Uses = [ESP], mayLoad = 1, hasSideEffects=0, + SchedRW = [WriteLoad] in { +def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize16; +def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", []>, OpSize32, + Requires<[Not64BitMode]>; +} + +let Defs = [ESP], Uses = [ESP, EFLAGS, DF], mayStore = 1, hasSideEffects=0, + SchedRW = [WriteStore] in { +def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize16; +def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>, OpSize32, + Requires<[Not64BitMode]>; +} + +let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in { +let mayLoad = 1, SchedRW = [WriteLoad] in { +def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>, + OpSize32, Requires<[In64BitMode]>; +// Long form for the disassembler. +let isCodeGenOnly = 1, ForceDisassemble = 1 in { +def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>, + OpSize32, Requires<[In64BitMode]>, NotMemoryFoldable; +} // isCodeGenOnly = 1, ForceDisassemble = 1 +} // mayLoad, SchedRW +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in +def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", []>, + OpSize32, Requires<[In64BitMode]>; +let mayStore = 1, SchedRW = [WriteStore] in { +def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", []>, + OpSize32, Requires<[In64BitMode]>; +// Long form for the disassembler. 
+let isCodeGenOnly = 1, ForceDisassemble = 1 in { +def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>, + OpSize32, Requires<[In64BitMode]>, NotMemoryFoldable; +} // isCodeGenOnly = 1, ForceDisassemble = 1 +} // mayStore, SchedRW +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { +def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>, + OpSize32, Requires<[In64BitMode]>; +} // mayLoad, mayStore, SchedRW +} + +let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, + SchedRW = [WriteStore] in { +def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), + "push{q}\t$imm", []>, OpSize32, + Requires<[In64BitMode]>; +def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), + "push{q}\t$imm", []>, OpSize32, + Requires<[In64BitMode]>; +} + +let Defs = [RSP, EFLAGS, DF], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in +def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", []>, + OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>; +let Defs = [RSP], Uses = [RSP, EFLAGS, DF], mayStore = 1, hasSideEffects=0 in +def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", []>, + OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>; + +let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP], + mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in { +def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", []>, + OpSize32, Requires<[Not64BitMode]>; +def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", []>, + OpSize16, Requires<[Not64BitMode]>; +} +let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], + mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { +def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", []>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", []>, + OpSize16, Requires<[Not64BitMode]>; +} + +let Constraints = "$src = $dst", SchedRW = [WriteBSWAP32] in { +// This instruction is a consequence of BSWAP32r observing operand size. The +// encoding is valid, but the behavior is undefined. +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in +def BSWAP16r_BAD : I<0xC8, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), + "bswap{w}\t$dst", []>, OpSize16, TB; +// GR32 = bswap GR32 +def BSWAP32r : I<0xC8, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), + "bswap{l}\t$dst", + [(set GR32:$dst, (bswap GR32:$src))]>, OpSize32, TB; + +let SchedRW = [WriteBSWAP64] in +def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), + "bswap{q}\t$dst", + [(set GR64:$dst, (bswap GR64:$src))]>, TB; +} // Constraints = "$src = $dst", SchedRW + +// Bit scan instructions. 
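+// BSF returns the index of the lowest set bit and BSR the index of the
+// highest; both set ZF when the source is zero (leaving the destination
+// undefined), which is why every form below also defines EFLAGS.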
+let Defs = [EFLAGS] in { +def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "bsf{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, + PS, OpSize16, Sched<[WriteBSF]>; +def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bsf{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, + PS, OpSize16, Sched<[WriteBSFLd]>; +def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "bsf{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, + PS, OpSize32, Sched<[WriteBSF]>; +def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bsf{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, + PS, OpSize32, Sched<[WriteBSFLd]>; +def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "bsf{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, + PS, Sched<[WriteBSF]>; +def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "bsf{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, + PS, Sched<[WriteBSFLd]>; + +def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "bsr{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, + PS, OpSize16, Sched<[WriteBSR]>; +def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bsr{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, + PS, OpSize16, Sched<[WriteBSRLd]>; +def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "bsr{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, + PS, OpSize32, Sched<[WriteBSR]>; +def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bsr{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, + PS, OpSize32, Sched<[WriteBSRLd]>; +def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "bsr{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, + PS, Sched<[WriteBSR]>; +def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "bsr{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, + PS, Sched<[WriteBSRLd]>; +} // Defs = [EFLAGS] + +let SchedRW = [WriteMicrocoded] in { +let Defs = [EDI,ESI], Uses = [EDI,ESI,DF] in { +def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src), + "movsb\t{$src, $dst|$dst, $src}", []>; +def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src), + "movsw\t{$src, $dst|$dst, $src}", []>, OpSize16; +def MOVSL : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src), + "movs{l|d}\t{$src, $dst|$dst, $src}", []>, OpSize32; +def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src), + "movsq\t{$src, $dst|$dst, $src}", []>, + Requires<[In64BitMode]>; +} + +let Defs = [EDI], Uses = [AL,EDI,DF] in +def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst), + "stosb\t{%al, $dst|$dst, al}", []>; +let Defs = [EDI], Uses = [AX,EDI,DF] in +def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst), + "stosw\t{%ax, $dst|$dst, ax}", []>, OpSize16; +let Defs = [EDI], Uses = [EAX,EDI,DF] in +def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst), + "stos{l|d}\t{%eax, $dst|$dst, eax}", []>, OpSize32; +let Defs = [RDI], Uses = [RAX,RDI,DF] in +def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins 
dstidx64:$dst), + "stosq\t{%rax, $dst|$dst, rax}", []>, + Requires<[In64BitMode]>; + +let Defs = [EDI,EFLAGS], Uses = [AL,EDI,DF] in +def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst), + "scasb\t{$dst, %al|al, $dst}", []>; +let Defs = [EDI,EFLAGS], Uses = [AX,EDI,DF] in +def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst), + "scasw\t{$dst, %ax|ax, $dst}", []>, OpSize16; +let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,DF] in +def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst), + "scas{l|d}\t{$dst, %eax|eax, $dst}", []>, OpSize32; +let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,DF] in +def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst), + "scasq\t{$dst, %rax|rax, $dst}", []>, + Requires<[In64BitMode]>; + +let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,DF] in { +def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src), + "cmpsb\t{$dst, $src|$src, $dst}", []>; +def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src), + "cmpsw\t{$dst, $src|$src, $dst}", []>, OpSize16; +def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src), + "cmps{l|d}\t{$dst, $src|$src, $dst}", []>, OpSize32; +def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src), + "cmpsq\t{$dst, $src|$src, $dst}", []>, + Requires<[In64BitMode]>; +} +} // SchedRW + +//===----------------------------------------------------------------------===// +// Move Instructions. +// +let SchedRW = [WriteMove] in { +let hasSideEffects = 0, isMoveReg = 1 in { +def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>; +def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16; +def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32; +def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>; +} + +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { +def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, imm:$src)]>; +def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, imm:$src)]>, OpSize16; +def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, relocImm:$src)]>, OpSize32; +def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, i64immSExt32:$src)]>; +} +let isReMaterializable = 1 in { +def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), + "movabs{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, relocImm:$src)]>; +} + +// Longer forms that use a ModR/M byte. 
Needed for disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>, + FoldGenData<"MOV8ri">; +def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16, + FoldGenData<"MOV16ri">; +def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32, + FoldGenData<"MOV32ri">; +} +} // SchedRW + +let SchedRW = [WriteStore] in { +def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(store (i8 imm8_su:$src), addr:$dst)]>; +def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(store (i16 imm16_su:$src), addr:$dst)]>, OpSize16; +def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(store (i32 imm32_su:$src), addr:$dst)]>, OpSize32; +def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(store i64immSExt32_su:$src, addr:$dst)]>, + Requires<[In64BitMode]>; +} // SchedRW + +let hasSideEffects = 0 in { + +/// Memory offset versions of moves. The immediate is an address mode sized +/// offset from the segment base. +let SchedRW = [WriteALU] in { +let mayLoad = 1 in { +let Defs = [AL] in +def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src), + "mov{b}\t{$src, %al|al, $src}", []>, + AdSize32; +let Defs = [AX] in +def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src), + "mov{w}\t{$src, %ax|ax, $src}", []>, + OpSize16, AdSize32; +let Defs = [EAX] in +def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src), + "mov{l}\t{$src, %eax|eax, $src}", []>, + OpSize32, AdSize32; +let Defs = [RAX] in +def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src), + "mov{q}\t{$src, %rax|rax, $src}", []>, + AdSize32; + +let Defs = [AL] in +def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src), + "mov{b}\t{$src, %al|al, $src}", []>, AdSize16; +let Defs = [AX] in +def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src), + "mov{w}\t{$src, %ax|ax, $src}", []>, + OpSize16, AdSize16; +let Defs = [EAX] in +def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src), + "mov{l}\t{$src, %eax|eax, $src}", []>, + AdSize16, OpSize32; +} // mayLoad +let mayStore = 1 in { +let Uses = [AL] in +def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs), (ins offset32_8:$dst), + "mov{b}\t{%al, $dst|$dst, al}", []>, AdSize32; +let Uses = [AX] in +def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_16:$dst), + "mov{w}\t{%ax, $dst|$dst, ax}", []>, + OpSize16, AdSize32; +let Uses = [EAX] in +def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_32:$dst), + "mov{l}\t{%eax, $dst|$dst, eax}", []>, + OpSize32, AdSize32; +let Uses = [RAX] in +def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs), (ins offset32_64:$dst), + "mov{q}\t{%rax, $dst|$dst, rax}", []>, + AdSize32; + +let Uses = [AL] in +def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs), (ins offset16_8:$dst), + "mov{b}\t{%al, $dst|$dst, al}", []>, AdSize16; +let Uses = [AX] in +def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_16:$dst), + "mov{w}\t{%ax, $dst|$dst, ax}", []>, + OpSize16, AdSize16; +let Uses = [EAX] in +def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, 
(outs), (ins offset16_32:$dst), + "mov{l}\t{%eax, $dst|$dst, eax}", []>, + OpSize32, AdSize16; +} // mayStore + +// These forms all have full 64-bit absolute addresses in their instructions +// and use the movabs mnemonic to indicate this specific form. +let mayLoad = 1 in { +let Defs = [AL] in +def MOV8ao64 : Ii64<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src), + "movabs{b}\t{$src, %al|al, $src}", []>, + AdSize64; +let Defs = [AX] in +def MOV16ao64 : Ii64<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src), + "movabs{w}\t{$src, %ax|ax, $src}", []>, + OpSize16, AdSize64; +let Defs = [EAX] in +def MOV32ao64 : Ii64<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src), + "movabs{l}\t{$src, %eax|eax, $src}", []>, + OpSize32, AdSize64; +let Defs = [RAX] in +def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src), + "movabs{q}\t{$src, %rax|rax, $src}", []>, + AdSize64; +} // mayLoad + +let mayStore = 1 in { +let Uses = [AL] in +def MOV8o64a : Ii64<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst), + "movabs{b}\t{%al, $dst|$dst, al}", []>, + AdSize64; +let Uses = [AX] in +def MOV16o64a : Ii64<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst), + "movabs{w}\t{%ax, $dst|$dst, ax}", []>, + OpSize16, AdSize64; +let Uses = [EAX] in +def MOV32o64a : Ii64<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst), + "movabs{l}\t{%eax, $dst|$dst, eax}", []>, + OpSize32, AdSize64; +let Uses = [RAX] in +def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst), + "movabs{q}\t{%rax, $dst|$dst, rax}", []>, + AdSize64; +} // mayStore +} // SchedRW +} // hasSideEffects = 0 + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteMove], isMoveReg = 1 in { +def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>, + FoldGenData<"MOV8rr">; +def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16, + FoldGenData<"MOV16rr">; +def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32, + FoldGenData<"MOV32rr">; +def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>, + FoldGenData<"MOV64rr">; +} + +// Reversed version with ".s" suffix for GAS compatibility. 
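+// (If these aliases were enabled, e.g. "mov.s %eax, %ebx" would map to
+// MOV32rr_REV above, selecting the 0x8B register-register encoding rather than
+// the default 0x89 form of MOV32rr.)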
+//def : InstAlias<"mov{b}.s\t{$src, $dst|$dst, $src}", +// (MOV8rr_REV GR8:$dst, GR8:$src), 0>; +//def : InstAlias<"mov{w}.s\t{$src, $dst|$dst, $src}", +// (MOV16rr_REV GR16:$dst, GR16:$src), 0>; +//def : InstAlias<"mov{l}.s\t{$src, $dst|$dst, $src}", +// (MOV32rr_REV GR32:$dst, GR32:$src), 0>; +//def : InstAlias<"mov{q}.s\t{$src, $dst|$dst, $src}", +// (MOV64rr_REV GR64:$dst, GR64:$src), 0>; +//def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}", +// (MOV8rr_REV GR8:$dst, GR8:$src), 0, "att">; +//def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}", +// (MOV16rr_REV GR16:$dst, GR16:$src), 0, "att">; +//def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}", +// (MOV32rr_REV GR32:$dst, GR32:$src), 0, "att">; +//def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}", +// (MOV64rr_REV GR64:$dst, GR64:$src), 0, "att">; + +let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { +def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, (loadi8 addr:$src))]>; +def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize16; +def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (loadi32 addr:$src))]>, OpSize32; +def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (load addr:$src))]>; +} + +let SchedRW = [WriteStore] in { +def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(store GR8:$src, addr:$dst)]>; +def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(store GR16:$src, addr:$dst)]>, OpSize16; +def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(store GR32:$src, addr:$dst)]>, OpSize32; +def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(store GR64:$src, addr:$dst)]>; +} // SchedRW + +// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so +// that they can be used for copying and storing h registers, which can't be +// encoded when a REX prefix is present. +let isCodeGenOnly = 1 in { +let hasSideEffects = 0, isMoveReg = 1 in +def MOV8rr_NOREX : I<0x88, MRMDestReg, + (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteMove]>; +let mayStore = 1, hasSideEffects = 0 in +def MOV8mr_NOREX : I<0x88, MRMDestMem, + (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteStore]>; +let mayLoad = 1, hasSideEffects = 0, + canFoldAsLoad = 1, isReMaterializable = 1 in +def MOV8rm_NOREX : I<0x8A, MRMSrcMem, + (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteLoad]>; +} + + +// Condition code ops, incl. set if equal/not equal/... 
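+// LAHF copies SF, ZF, AF, PF and CF into AH; SAHF writes them back from AH
+// into EFLAGS, which is why SAHF below defines EFLAGS and uses AH while LAHF
+// does the reverse.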
+let SchedRW = [WriteLAHFSAHF] in { +let Defs = [EFLAGS], Uses = [AH] in +def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", + [(set EFLAGS, (X86sahf AH))]>, + Requires<[HasLAHFSAHF]>; +let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in +def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>, // AH = flags + Requires<[HasLAHFSAHF]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Bit tests instructions: BT, BTS, BTR, BTC. + +let Defs = [EFLAGS] in { +let SchedRW = [WriteBitTest] in { +def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>, + OpSize16, TB, NotMemoryFoldable; +def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))]>, + OpSize32, TB, NotMemoryFoldable; +def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))]>, TB, + NotMemoryFoldable; +} // SchedRW + +// Unlike with the register+register form, the memory+register form of the +// bt instruction does not ignore the high bits of the index. From ISel's +// perspective, this is pretty bizarre. Make these instructions disassembly +// only for now. These instructions are also slow on modern CPUs so that's +// another reason to avoid generating them. + +let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in { + def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + []>, OpSize16, TB, NotMemoryFoldable; + def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + []>, OpSize32, TB, NotMemoryFoldable; + def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + []>, TB, NotMemoryFoldable; +} + +let SchedRW = [WriteBitTest] in { +def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>, + OpSize16, TB; +def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>, + OpSize32, TB; +def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB; +} // SchedRW + +// Note that these instructions aren't slow because that only applies when the +// other operand is in a register. When it's an immediate, bt is still fast. 
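+// For example, "btw $3, (%rdi)" encodes the bit index as a single imm8 taken
+// modulo the operand width, so it only ever touches the addressed word, unlike
+// the register-index memory forms above.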
+let SchedRW = [WriteALU] in { +def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi16 addr:$src1), + i16immSExt8:$src2))]>, + OpSize16, TB; +def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi32 addr:$src1), + i32immSExt8:$src2))]>, + OpSize32, TB; +def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi64 addr:$src1), + i64immSExt8:$src2))]>, TB, + Requires<[In64BitMode]>; +} // SchedRW + +let hasSideEffects = 0 in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { +def BTC16rr : I<0xBB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB, NotMemoryFoldable; +def BTC32rr : I<0xBB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB, NotMemoryFoldable; +def BTC64rr : RI<0xBB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + NotMemoryFoldable; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB, NotMemoryFoldable; +def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB, NotMemoryFoldable; +def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + NotMemoryFoldable; +} + +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { +def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; +def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; +def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; +def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; +def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + Requires<[In64BitMode]>; +} + +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { +def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB, NotMemoryFoldable; +def BTR32rr : I<0xB3, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB, NotMemoryFoldable; +def BTR64rr : RI<0xB3, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + NotMemoryFoldable; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "btr{w}\t{$src2, $src1|$src1, 
$src2}", []>, + OpSize16, TB, NotMemoryFoldable; +def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB, NotMemoryFoldable; +def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + NotMemoryFoldable; +} + +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { +def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB; +def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB; +def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB; +def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB; +def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + Requires<[In64BitMode]>; +} + +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { +def BTS16rr : I<0xAB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB, NotMemoryFoldable; +def BTS32rr : I<0xAB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB, NotMemoryFoldable; +def BTS64rr : RI<0xAB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + NotMemoryFoldable; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB, NotMemoryFoldable; +def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB, NotMemoryFoldable; +def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + NotMemoryFoldable; +} + +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { +def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; +def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; +def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; +def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; +def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + Requires<[In64BitMode]>; +} +} // hasSideEffects = 0 +} // Defs = [EFLAGS] + + 
+//===----------------------------------------------------------------------===// +// Atomic support +// + +// Atomic swap. These are just normal xchg instructions. But since a memory +// operand is referenced, the atomicity is ensured. +multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag> { + let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in { + def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), + [(set + GR8:$dst, + (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>; + def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), + [(set + GR16:$dst, + (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>, + OpSize16; + def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), + [(set + GR32:$dst, + (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>, + OpSize32; + def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), + [(set + GR64:$dst, + (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>; + } +} + +defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap">, NotMemoryFoldable; + +// Swap between registers. +let SchedRW = [WriteALU] in { +let Constraints = "$src1 = $dst1, $src2 = $dst2", hasSideEffects = 0 in { +def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst1, GR8:$dst2), + (ins GR8:$src1, GR8:$src2), + "xchg{b}\t{$src1, $src2|$src2, $src1}", []>, NotMemoryFoldable; +def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst1, GR16:$dst2), + (ins GR16:$src1, GR16:$src2), + "xchg{w}\t{$src1, $src2|$src2, $src1}", []>, + OpSize16, NotMemoryFoldable; +def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst1, GR32:$dst2), + (ins GR32:$src1, GR32:$src2), + "xchg{l}\t{$src1, $src2|$src2, $src1}", []>, + OpSize32, NotMemoryFoldable; +def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst1, GR64:$dst2), + (ins GR64:$src1 ,GR64:$src2), + "xchg{q}\t{$src1, $src2|$src2, $src1}", []>, NotMemoryFoldable; +} + +def NOOP19rr: I<0x19, MRMSrcReg, (outs), (ins GR32:$val, GR32:$src), + "nop\t{$val, $src|$src, $val}", []>, TB, + OpSize32; + +// Swap between EAX and other registers. 
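+// These use the compact 0x90+reg encoding; with the accumulator as the operand
+// this is the single byte 0x90, the same encoding as the NOOP definition
+// earlier, since the classic NOP is historically just an exchange of (E)AX
+// with itself.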
+let Constraints = "$src = $dst", hasSideEffects = 0 in { +let Uses = [AX], Defs = [AX] in +def XCHG16ar : I<0x90, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), + "xchg{w}\t{%ax, $src|$src, ax}", []>, OpSize16; +let Uses = [EAX], Defs = [EAX] in +def XCHG32ar : I<0x90, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), + "xchg{l}\t{%eax, $src|$src, eax}", []>, OpSize32; +let Uses = [RAX], Defs = [RAX] in +def XCHG64ar : RI<0x90, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), + "xchg{q}\t{%rax, $src|$src, rax}", []>; +} +} // SchedRW + +let hasSideEffects = 0, Constraints = "$src1 = $dst1, $src2 = $dst2", + Defs = [EFLAGS], SchedRW = [WriteALU] in { +def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst1, GR8:$dst2), + (ins GR8:$src1, GR8:$src2), + "xadd{b}\t{$src2, $src1|$src1, $src2}", []>, TB; +def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst1, GR16:$dst2), + (ins GR16:$src1, GR16:$src2), + "xadd{w}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize16; +def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst1, GR32:$dst2), + (ins GR32:$src1, GR32:$src2), + "xadd{l}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize32; +def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst1, GR64:$dst2), + (ins GR64:$src1, GR64:$src2), + "xadd{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, hasSideEffects = 0, Constraints = "$val = $dst", + Defs = [EFLAGS], SchedRW = [WriteALULd, WriteRMW] in { +def XADD8rm : I<0xC0, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + "xadd{b}\t{$val, $ptr|$ptr, $val}", []>, TB; +def XADD16rm : I<0xC1, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + "xadd{w}\t{$val, $ptr|$ptr, $val}", []>, TB, + OpSize16; +def XADD32rm : I<0xC1, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + "xadd{l}\t{$val, $ptr|$ptr, $val}", []>, TB, + OpSize32; +def XADD64rm : RI<0xC1, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + "xadd{q}\t{$val, $ptr|$ptr, $val}", []>, TB; + +} + +let SchedRW = [WriteALU], hasSideEffects = 0 in { +let Defs = [AL, EFLAGS], Uses = [AL] in +def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), + "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB, + NotMemoryFoldable; +let Defs = [AX, EFLAGS], Uses = [AX] in +def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16, + NotMemoryFoldable; +let Defs = [EAX, EFLAGS], Uses = [EAX] in +def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32, + NotMemoryFoldable; +let Defs = [RAX, EFLAGS], Uses = [RAX] in +def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB, + NotMemoryFoldable; +} // SchedRW, hasSideEffects + +let SchedRW = [WriteALULd, WriteRMW], mayLoad = 1, mayStore = 1, + hasSideEffects = 0 in { +let Defs = [AL, EFLAGS], Uses = [AL] in +def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), + "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB, + NotMemoryFoldable; +let Defs = [AX, EFLAGS], Uses = [AX] in +def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16, + NotMemoryFoldable; +let Defs = [EAX, EFLAGS], Uses = [EAX] in +def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32, + NotMemoryFoldable; +let Defs = [RAX, EFLAGS], Uses = [RAX] in +def 
CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB, + NotMemoryFoldable; + +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in +def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst), + "cmpxchg8b\t$dst", []>, TB; + +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in +def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), + "cmpxchg16b\t$dst", []>, + TB, Requires<[HasCmpxchg16b, In64BitMode]>; +} // SchedRW, mayLoad, mayStore, hasSideEffects + + +// Lock instruction prefix +let SchedRW = [WriteMicrocoded] in +def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>; + +let SchedRW = [WriteNop] in { + +// Rex64 instruction prefix +def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>, + Requires<[In64BitMode]>; + +// Data16 instruction prefix +def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>; +} // SchedRW + +// Repeat string operation instruction prefixes +let Defs = [ECX], Uses = [ECX,DF], SchedRW = [WriteMicrocoded] in { +// Repeat (used with INS, OUTS, MOVS, LODS and STOS) +def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>; +// Repeat while not equal (used with CMPS and SCAS) +def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>; +} + +// String manipulation instructions +let SchedRW = [WriteMicrocoded] in { +let Defs = [AL,ESI], Uses = [ESI,DF] in +def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src), + "lodsb\t{$src, %al|al, $src}", []>; +let Defs = [AX,ESI], Uses = [ESI,DF] in +def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src), + "lodsw\t{$src, %ax|ax, $src}", []>, OpSize16; +let Defs = [EAX,ESI], Uses = [ESI,DF] in +def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src), + "lods{l|d}\t{$src, %eax|eax, $src}", []>, OpSize32; +let Defs = [RAX,ESI], Uses = [ESI,DF] in +def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src), + "lodsq\t{$src, %rax|rax, $src}", []>, + Requires<[In64BitMode]>; +} + +let SchedRW = [WriteSystem] in { +let Defs = [ESI], Uses = [DX,ESI,DF] in { +def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src), + "outsb\t{$src, %dx|dx, $src}", []>; +def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src), + "outsw\t{$src, %dx|dx, $src}", []>, OpSize16; +def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src), + "outs{l|d}\t{$src, %dx|dx, $src}", []>, OpSize32; +} + +let Defs = [EDI], Uses = [DX,EDI,DF] in { +def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst), + "insb\t{%dx, $dst|$dst, dx}", []>; +def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst), + "insw\t{%dx, $dst|$dst, dx}", []>, OpSize16; +def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst), + "ins{l|d}\t{%dx, $dst|$dst, dx}", []>, OpSize32; +} +} + +// EFLAGS management instructions. +let SchedRW = [WriteALU], Defs = [EFLAGS], Uses = [EFLAGS] in { +def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>; +def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>; +def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>; +} + +// DF management instructions. 
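Editorial aside, not part of the patched .td file: the XADD and CMPXCHG definitions above become atomic read-modify-write operations once the LOCK prefix (also defined above) is applied. A hedged C++ sketch of the idioms they implement, assuming an x86-64 target (the helper names are made up):

#include <atomic>
#include <cstdint>

// Typically lowers to `lock xadd` on x86-64.
uint32_t bump(std::atomic<uint32_t> &ctr) {
  return ctr.fetch_add(1, std::memory_order_seq_cst);
}

// Typically lowers to `lock cmpxchg`; a 128-bit atomic would need the
// cmpxchg16b form defined above.
bool publish(std::atomic<uint64_t> &slot, uint64_t expected, uint64_t desired) {
  return slot.compare_exchange_strong(expected, desired);
}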
+let SchedRW = [WriteALU], Defs = [DF] in { +def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>; +def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>; +} + +// Table lookup instructions +let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in +def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>, Sched<[WriteLoad]>; + +let SchedRW = [WriteMicrocoded] in { +// ASCII Adjust After Addition +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", []>, + Requires<[Not64BitMode]>; + +// ASCII Adjust AX Before Division +let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src), + "aad\t$src", []>, Requires<[Not64BitMode]>; + +// ASCII Adjust AX After Multiply +let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src), + "aam\t$src", []>, Requires<[Not64BitMode]>; + +// ASCII Adjust AL After Subtraction - sets +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", []>, + Requires<[Not64BitMode]>; + +// Decimal Adjust AL after Addition +let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in +def DAA : I<0x27, RawFrm, (outs), (ins), "daa", []>, + Requires<[Not64BitMode]>; + +// Decimal Adjust AL after Subtraction +let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in +def DAS : I<0x2F, RawFrm, (outs), (ins), "das", []>, + Requires<[Not64BitMode]>; +} // SchedRW + +let SchedRW = [WriteSystem] in { +// Check Array Index Against Bounds +// Note: "bound" does not have reversed operands in at&t syntax. +def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i32mem:$src), + "bound\t$dst, $src", []>, OpSize16, + Requires<[Not64BitMode]>; +def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i64mem:$src), + "bound\t$dst, $src", []>, OpSize32, + Requires<[Not64BitMode]>; + +// Adjust RPL Field of Segment Selector +def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "arpl\t{$src, $dst|$dst, $src}", []>, + Requires<[Not64BitMode]>, NotMemoryFoldable; +let mayStore = 1 in +def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "arpl\t{$src, $dst|$dst, $src}", []>, + Requires<[Not64BitMode]>, NotMemoryFoldable; +} // SchedRW + +//===----------------------------------------------------------------------===// +// MOVBE Instructions +// +let Predicates = [HasMOVBE] in { + let SchedRW = [WriteALULd] in { + def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movbe{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>, + OpSize16, T8PS; + def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "movbe{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>, + OpSize32, T8PS; + def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "movbe{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>, + T8PS; + } + let SchedRW = [WriteStore] in { + def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "movbe{w}\t{$src, $dst|$dst, $src}", + [(store (bswap GR16:$src), addr:$dst)]>, + OpSize16, T8PS; + def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movbe{l}\t{$src, $dst|$dst, $src}", + [(store (bswap GR32:$src), addr:$dst)]>, + OpSize32, T8PS; + def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + 
"movbe{q}\t{$src, $dst|$dst, $src}", + [(store (bswap GR64:$src), addr:$dst)]>, + T8PS; + } +} + +//===----------------------------------------------------------------------===// +// RDRAND Instruction +// +let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in { + def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins), + "rdrand{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86rdrand))]>, + OpSize16, PS; + def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins), + "rdrand{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86rdrand))]>, + OpSize32, PS; + def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins), + "rdrand{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86rdrand))]>, + PS; +} + +//===----------------------------------------------------------------------===// +// RDSEED Instruction +// +let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in { + def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), "rdseed{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, PS; + def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), "rdseed{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, PS; + def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdseed{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86rdseed))]>, PS; +} + +//===----------------------------------------------------------------------===// +// LZCNT Instruction +// +let Predicates = [HasLZCNT], Defs = [EFLAGS] in { + def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, + XS, OpSize16, Sched<[WriteLZCNT]>; + def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctlz (loadi16 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteLZCNTLd]>; + + def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, + XS, OpSize32, Sched<[WriteLZCNT]>; + def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "lzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctlz (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteLZCNTLd]>; + + def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "lzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>, + XS, Sched<[WriteLZCNT]>; + def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "lzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctlz (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS, Sched<[WriteLZCNTLd]>; +} + +//===----------------------------------------------------------------------===// +// BMI Instructions +// +let Predicates = [HasBMI], Defs = [EFLAGS] in { + def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "tzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, + XS, OpSize16, Sched<[WriteTZCNT]>; + def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "tzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (cttz (loadi16 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteTZCNTLd]>; + + def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "tzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, + XS, OpSize32, Sched<[WriteTZCNT]>; + def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), 
(ins i32mem:$src), + "tzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (cttz (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteTZCNTLd]>; + + def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "tzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>, + XS, Sched<[WriteTZCNT]>; + def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "tzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (cttz (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS, Sched<[WriteTZCNTLd]>; +} + +multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, + RegisterClass RC, X86MemOperand x86memop> { +let hasSideEffects = 0 in { + def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), + !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, + T8PS, VEX_4V, Sched<[WriteALU]>; + let mayLoad = 1 in + def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), + !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, + T8PS, VEX_4V, Sched<[WriteALULd]>; +} +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>; + defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W; + defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>; + defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W; + defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>; + defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W; +} + +//===----------------------------------------------------------------------===// +// Pattern fragments to auto generate BMI instructions. +//===----------------------------------------------------------------------===// + +let Predicates = [HasBMI] in { + // FIXME: patterns for the load versions are not implemented + def : Pat<(and GR32:$src, (add GR32:$src, -1)), + (BLSR32rr GR32:$src)>; + def : Pat<(and GR64:$src, (add GR64:$src, -1)), + (BLSR64rr GR64:$src)>; + + def : Pat<(xor GR32:$src, (add GR32:$src, -1)), + (BLSMSK32rr GR32:$src)>; + def : Pat<(xor GR64:$src, (add GR64:$src, -1)), + (BLSMSK64rr GR64:$src)>; + + def : Pat<(and GR32:$src, (ineg GR32:$src)), + (BLSI32rr GR32:$src)>; + def : Pat<(and GR64:$src, (ineg GR64:$src)), + (BLSI64rr GR64:$src)>; +} + +multiclass bmi_bextr<bits<8> opc, string mnemonic, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, + PatFrag ld_frag, X86FoldableSchedWrite Sched> { + def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>, + T8PS, VEX, Sched<[Sched]>; + def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)), + (implicit EFLAGS)]>, T8PS, VEX, + Sched<[Sched.Folded, + // x86memop:$src1 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src2 + ReadAfterLd]>; +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BEXTR32 : bmi_bextr<0xF7, "bextr{l}", GR32, i32mem, + X86bextr, loadi32, WriteBEXTR>; + defm BEXTR64 : bmi_bextr<0xF7, "bextr{q}", GR64, i64mem, + X86bextr, loadi64, WriteBEXTR>, VEX_W; +} + +multiclass bmi_bzhi<bits<8> opc, string mnemonic, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int, + PatFrag ld_frag, X86FoldableSchedWrite Sched> { + def rr : I<opc, MRMSrcReg4VOp3, (outs 
RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>, + T8PS, VEX, Sched<[Sched]>; + def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)), + (implicit EFLAGS)]>, T8PS, VEX, + Sched<[Sched.Folded, + // x86memop:$src1 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src2 + ReadAfterLd]>; +} + +let Predicates = [HasBMI2], Defs = [EFLAGS] in { + defm BZHI32 : bmi_bzhi<0xF5, "bzhi{l}", GR32, i32mem, + int_x86_bmi_bzhi_32, loadi32, WriteBZHI>; + defm BZHI64 : bmi_bzhi<0xF5, "bzhi{q}", GR64, i64mem, + int_x86_bmi_bzhi_64, loadi64, WriteBZHI>, VEX_W; +} + +def CountTrailingOnes : SDNodeXForm<imm, [{ + // Count the trailing ones in the immediate. + return getI8Imm(countTrailingOnes(N->getZExtValue()), SDLoc(N)); +}]>; + +def BEXTRMaskXForm : SDNodeXForm<imm, [{ + unsigned Length = countTrailingOnes(N->getZExtValue()); + return getI32Imm(Length << 8, SDLoc(N)); +}]>; + +def AndMask64 : ImmLeaf<i64, [{ + return isMask_64(Imm) && !isUInt<32>(Imm); +}]>; + +// Use BEXTR for 64-bit 'and' with large immediate 'mask'. +let Predicates = [HasBMI, NoBMI2, NoTBM] in { + def : Pat<(and GR64:$src, AndMask64:$mask), + (BEXTR64rr GR64:$src, + (SUBREG_TO_REG (i64 0), + (MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>; + def : Pat<(and (loadi64 addr:$src), AndMask64:$mask), + (BEXTR64rm addr:$src, + (SUBREG_TO_REG (i64 0), + (MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>; +} + +// Use BZHI for 64-bit 'and' with large immediate 'mask'. +let Predicates = [HasBMI2, NoTBM] in { + def : Pat<(and GR64:$src, AndMask64:$mask), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>; + def : Pat<(and (loadi64 addr:$src), AndMask64:$mask), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>; +} + +let Predicates = [HasBMI2] in { + multiclass _bmi_bzhi_pattern<dag regpattern, dag mempattern, RegisterClass RC, + ValueType VT, Instruction DstInst, + Instruction DstMemInst> { + def : Pat<regpattern, + (DstInst RC:$src, + (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + def : Pat<mempattern, + (DstMemInst addr:$src, + (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + } + + multiclass bmi_bzhi_patterns<RegisterClass RC, int bitwidth, ValueType VT, + Instruction DstInst, X86MemOperand x86memop, + Instruction DstMemInst> { + // x & ((1 << y) - 1) + defm : _bmi_bzhi_pattern<(and RC:$src, (add (shl 1, GR8:$lz), -1)), + (and (x86memop addr:$src), + (add (shl 1, GR8:$lz), -1)), + RC, VT, DstInst, DstMemInst>; + + // x & ~(-1 << y) + defm : _bmi_bzhi_pattern<(and RC:$src, (xor (shl -1, GR8:$lz), -1)), + (and (x86memop addr:$src), + (xor (shl -1, GR8:$lz), -1)), + RC, VT, DstInst, DstMemInst>; + + // x & (-1 >> (bitwidth - y)) + defm : _bmi_bzhi_pattern<(and RC:$src, (srl -1, (sub bitwidth, GR8:$lz))), + (and (x86memop addr:$src), + (srl -1, (sub bitwidth, GR8:$lz))), + RC, VT, DstInst, DstMemInst>; + + // x << (bitwidth - y) >> (bitwidth - y) + defm : _bmi_bzhi_pattern<(srl (shl RC:$src, (sub bitwidth, GR8:$lz)), + (sub bitwidth, GR8:$lz)), + (srl (shl (x86memop addr:$src), + (sub bitwidth, GR8:$lz)), + (sub bitwidth, GR8:$lz)), + RC, VT, DstInst, DstMemInst>; + } + + defm : bmi_bzhi_patterns<GR32, 
32, i32, BZHI32rr, loadi32, BZHI32rm>; + defm : bmi_bzhi_patterns<GR64, 64, i64, BZHI64rr, loadi64, BZHI64rm>; + + // x & (-1 >> (32 - y)) + def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), + (BZHI32rr GR32:$src, GR32:$lz)>; + def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), + (BZHI32rm addr:$src, GR32:$lz)>; + + // x & (-1 >> (64 - y)) + def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + + // x << (32 - y) >> (32 - y) + def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))), + (i8 (trunc (sub 32, GR32:$lz)))), + (BZHI32rr GR32:$src, GR32:$lz)>; + def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))), + (i8 (trunc (sub 32, GR32:$lz)))), + (BZHI32rm addr:$src, GR32:$lz)>; + + // x << (64 - y) >> (64 - y) + def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))), + (i8 (trunc (sub 64, GR32:$lz)))), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))), + (i8 (trunc (sub 64, GR32:$lz)))), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; +} // HasBMI2 + +multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int, + PatFrag ld_frag> { + def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, RC:$src2))]>, + VEX_4V, Sched<[WriteALU]>; + def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, + VEX_4V, Sched<[WriteALULd, ReadAfterLd]>; +} + +let Predicates = [HasBMI2] in { + defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, + int_x86_bmi_pdep_32, loadi32>, T8XD; + defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, + int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W; + defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, + int_x86_bmi_pext_32, loadi32>, T8XS; + defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, + int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W; +} + +//===----------------------------------------------------------------------===// +// TBM Instructions +// +let Predicates = [HasTBM], Defs = [EFLAGS] in { + +multiclass tbm_ternary_imm<bits<8> opc, RegisterClass RC, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + SDNode OpNode, Operand immtype, + SDPatternOperator immoperator, + X86FoldableSchedWrite Sched> { + def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl), + !strconcat(OpcodeStr, + "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), + [(set RC:$dst, (OpNode RC:$src1, immoperator:$cntl))]>, + XOP, XOPA, Sched<[Sched]>; + def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src1, immtype:$cntl), + !strconcat(OpcodeStr, + "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), + [(set RC:$dst, (OpNode (ld_frag addr:$src1), immoperator:$cntl))]>, + XOP, XOPA, Sched<[Sched.Folded]>; +} + +defm BEXTRI32 : tbm_ternary_imm<0x10, GR32, "bextr{l}", i32mem, loadi32, + X86bextr, i32imm, imm, WriteBEXTR>; +let ImmT = Imm32S in +defm BEXTRI64 : tbm_ternary_imm<0x10, 
GR64, "bextr{q}", i64mem, loadi64, + X86bextr, i64i32imm, + i64immSExt32, WriteBEXTR>, VEX_W; + +multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem, + RegisterClass RC, string OpcodeStr, + X86MemOperand x86memop, X86FoldableSchedWrite Sched> { +let hasSideEffects = 0 in { + def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>, + XOP_4V, XOP9, Sched<[Sched]>; + let mayLoad = 1 in + def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>, + XOP_4V, XOP9, Sched<[Sched.Folded]>; +} +} + +multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite Sched, + Format FormReg, Format FormMem> { + defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr#"{l}", + i32mem, Sched>; + defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr#"{q}", + i64mem, Sched>, VEX_W; +} + +defm BLCFILL : tbm_binary_intr<0x01, "blcfill", WriteALU, MRM1r, MRM1m>; +defm BLCI : tbm_binary_intr<0x02, "blci", WriteALU, MRM6r, MRM6m>; +defm BLCIC : tbm_binary_intr<0x01, "blcic", WriteALU, MRM5r, MRM5m>; +defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", WriteALU, MRM1r, MRM1m>; +defm BLCS : tbm_binary_intr<0x01, "blcs", WriteALU, MRM3r, MRM3m>; +defm BLSFILL : tbm_binary_intr<0x01, "blsfill", WriteALU, MRM2r, MRM2m>; +defm BLSIC : tbm_binary_intr<0x01, "blsic", WriteALU, MRM6r, MRM6m>; +defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", WriteALU, MRM7r, MRM7m>; +defm TZMSK : tbm_binary_intr<0x01, "tzmsk", WriteALU, MRM4r, MRM4m>; +} // HasTBM, EFLAGS + +// Use BEXTRI for 64-bit 'and' with large immediate 'mask'. +let Predicates = [HasTBM] in { + def : Pat<(and GR64:$src, AndMask64:$mask), + (BEXTRI64ri GR64:$src, (BEXTRMaskXForm imm:$mask))>; + + def : Pat<(and (loadi64 addr:$src), AndMask64:$mask), + (BEXTRI64mi addr:$src, (BEXTRMaskXForm imm:$mask))>; +} + +//===----------------------------------------------------------------------===// +// Lightweight Profiling Instructions + +let Predicates = [HasLWP], SchedRW = [WriteSystem] in { + +def LLWPCB : I<0x12, MRM0r, (outs), (ins GR32:$src), "llwpcb\t$src", + [(int_x86_llwpcb GR32:$src)]>, XOP, XOP9; +def SLWPCB : I<0x12, MRM1r, (outs GR32:$dst), (ins), "slwpcb\t$dst", + [(set GR32:$dst, (int_x86_slwpcb))]>, XOP, XOP9; + +def LLWPCB64 : I<0x12, MRM0r, (outs), (ins GR64:$src), "llwpcb\t$src", + [(int_x86_llwpcb GR64:$src)]>, XOP, XOP9, VEX_W; +def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst", + [(set GR64:$dst, (int_x86_slwpcb))]>, XOP, XOP9, VEX_W; + +multiclass lwpins_intr<RegisterClass RC> { + def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), + "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", + [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))]>, + XOP_4V, XOPA; + let mayLoad = 1 in + def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), + "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", + [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))]>, + XOP_4V, XOPA; +} + +let Defs = [EFLAGS] in { + defm LWPINS32 : lwpins_intr<GR32>; + defm LWPINS64 : lwpins_intr<GR64>, VEX_W; +} // EFLAGS + +multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> { + def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), + "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", + [(Int RC:$src0, GR32:$src1, imm:$cntl)]>, XOP_4V, XOPA; + let mayLoad = 1 in + def rmi : Ii32<0x12, MRM1m, (outs), (ins 
RC:$src0, i32mem:$src1, i32imm:$cntl), + "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", + [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)]>, + XOP_4V, XOPA; +} + +defm LWPVAL32 : lwpval_intr<GR32, int_x86_lwpval32>; +defm LWPVAL64 : lwpval_intr<GR64, int_x86_lwpval64>, VEX_W; + +} // HasLWP, SchedRW + +//===----------------------------------------------------------------------===// +// MONITORX/MWAITX Instructions +// +let SchedRW = [ WriteSystem ] in { +/* + let usesCustomInserter = 1 in { + def MONITORX : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), + [(int_x86_monitorx addr:$src1, GR32:$src2, GR32:$src3)]>, + Requires<[ HasMWAITX ]>; + } +*/ + + let Uses = [ EAX, ECX, EDX ] in { + def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>, + TB, Requires<[ HasMWAITX ]>; + } + + let Uses = [ ECX, EAX, EBX ] in { + def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", + [(int_x86_mwaitx ECX, EAX, EBX)]>, + TB, Requires<[ HasMWAITX ]>; + } +} // SchedRW + +def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrrr)>, + Requires<[ Not64BitMode ]>; +def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrrr)>, + Requires<[ In64BitMode ]>; + +def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>, + Requires<[ Not64BitMode ]>; +def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>, + Requires<[ In64BitMode ]>; + +//===----------------------------------------------------------------------===// +// WAITPKG Instructions +// +let SchedRW = [WriteSystem] in { + def UMONITOR16 : I<0xAE, MRM6r, (outs), (ins GR16:$src), + "umonitor\t$src", [(int_x86_umonitor GR16:$src)]>, + XS, AdSize16, Requires<[HasWAITPKG, Not64BitMode]>; + def UMONITOR32 : I<0xAE, MRM6r, (outs), (ins GR32:$src), + "umonitor\t$src", [(int_x86_umonitor GR32:$src)]>, + XS, AdSize32, Requires<[HasWAITPKG]>; + def UMONITOR64 : I<0xAE, MRM6r, (outs), (ins GR64:$src), + "umonitor\t$src", [(int_x86_umonitor GR64:$src)]>, + XS, AdSize64, Requires<[HasWAITPKG, In64BitMode]>; + let Uses = [EAX, EDX], Defs = [EFLAGS] in { + def UMWAIT : I<0xAE, MRM6r, + (outs), (ins GR32orGR64:$src), "umwait\t$src", + [(set EFLAGS, (X86umwait GR32orGR64:$src, EDX, EAX))]>, + XD, Requires<[HasWAITPKG]>; + def TPAUSE : I<0xAE, MRM6r, + (outs), (ins GR32orGR64:$src), "tpause\t$src", + [(set EFLAGS, (X86tpause GR32orGR64:$src, EDX, EAX))]>, + PD, Requires<[HasWAITPKG]>, NotMemoryFoldable; + } +} // SchedRW + +//===----------------------------------------------------------------------===// +// MOVDIRI - Move doubleword/quadword as direct store +// +let SchedRW = [WriteStore] in { +def MOVDIRI32 : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movdiri\t{$src, $dst|$dst, $src}", + [(int_x86_directstore32 addr:$dst, GR32:$src)]>, + T8, Requires<[HasMOVDIRI]>; +def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movdiri\t{$src, $dst|$dst, $src}", + [(int_x86_directstore64 addr:$dst, GR64:$src)]>, + T8, Requires<[In64BitMode, HasMOVDIRI]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// MOVDIR64B - Move 64 bytes as direct store +// +let SchedRW = [WriteStore] in { +def MOVDIR64B16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src), + "movdir64b\t{$src, $dst|$dst, $src}", []>, + T8PD, AdSize16, Requires<[HasMOVDIR64B, Not64BitMode]>; +def MOVDIR64B32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src), + "movdir64b\t{$src, $dst|$dst, $src}", + [(int_x86_movdir64b 
GR32:$dst, addr:$src)]>, + T8PD, AdSize32, Requires<[HasMOVDIR64B]>; +def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src), + "movdir64b\t{$src, $dst|$dst, $src}", + [(int_x86_movdir64b GR64:$dst, addr:$src)]>, + T8PD, AdSize64, Requires<[HasMOVDIR64B, In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// CLZERO Instruction +// +let SchedRW = [WriteSystem] in { + let Uses = [EAX] in + def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, + TB, Requires<[HasCLZERO]>; + +/* + let usesCustomInserter = 1 in { + def CLZERO : PseudoI<(outs), (ins i32mem:$src1), + [(int_x86_clzero addr:$src1)]>, Requires<[HasCLZERO]>; + } +*/ +} // SchedRW + +def : InstAlias<"clzero\t{%eax|eax}", (CLZEROr)>, Requires<[Not64BitMode]>; +def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// Pattern fragments to auto generate TBM instructions. +//===----------------------------------------------------------------------===// + +let Predicates = [HasTBM] in { + // FIXME: patterns for the load versions are not implemented + def : Pat<(and GR32:$src, (add GR32:$src, 1)), + (BLCFILL32rr GR32:$src)>; + def : Pat<(and GR64:$src, (add GR64:$src, 1)), + (BLCFILL64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (not (add GR32:$src, 1))), + (BLCI32rr GR32:$src)>; + def : Pat<(or GR64:$src, (not (add GR64:$src, 1))), + (BLCI64rr GR64:$src)>; + + // Extra patterns because opt can optimize the above patterns to this. + def : Pat<(or GR32:$src, (sub -2, GR32:$src)), + (BLCI32rr GR32:$src)>; + def : Pat<(or GR64:$src, (sub -2, GR64:$src)), + (BLCI64rr GR64:$src)>; + + def : Pat<(and (not GR32:$src), (add GR32:$src, 1)), + (BLCIC32rr GR32:$src)>; + def : Pat<(and (not GR64:$src), (add GR64:$src, 1)), + (BLCIC64rr GR64:$src)>; + + def : Pat<(xor GR32:$src, (add GR32:$src, 1)), + (BLCMSK32rr GR32:$src)>; + def : Pat<(xor GR64:$src, (add GR64:$src, 1)), + (BLCMSK64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (add GR32:$src, 1)), + (BLCS32rr GR32:$src)>; + def : Pat<(or GR64:$src, (add GR64:$src, 1)), + (BLCS64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (add GR32:$src, -1)), + (BLSFILL32rr GR32:$src)>; + def : Pat<(or GR64:$src, (add GR64:$src, -1)), + (BLSFILL64rr GR64:$src)>; + + def : Pat<(or (not GR32:$src), (add GR32:$src, -1)), + (BLSIC32rr GR32:$src)>; + def : Pat<(or (not GR64:$src), (add GR64:$src, -1)), + (BLSIC64rr GR64:$src)>; + + def : Pat<(or (not GR32:$src), (add GR32:$src, 1)), + (T1MSKC32rr GR32:$src)>; + def : Pat<(or (not GR64:$src), (add GR64:$src, 1)), + (T1MSKC64rr GR64:$src)>; + + def : Pat<(and (not GR32:$src), (add GR32:$src, -1)), + (TZMSK32rr GR32:$src)>; + def : Pat<(and (not GR64:$src), (add GR64:$src, -1)), + (TZMSK64rr GR64:$src)>; +} // HasTBM + +//===----------------------------------------------------------------------===// +// Memory Instructions +// + +let Predicates = [HasCLFLUSHOPT], SchedRW = [WriteLoad] in +def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src), + "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD; + +let Predicates = [HasCLWB], SchedRW = [WriteLoad] in +def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", + [(int_x86_clwb addr:$src)]>, PD, NotMemoryFoldable; + +let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in +def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src", + [(int_x86_cldemote addr:$src)]>, TB; + 
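Editorial aside, not part of the patched .td file: each TBM pattern fragment above (like the earlier BMI BLSR/BLSMSK/BLSI patterns) matches one bit-manipulation idiom so that instruction selection can replace the whole expression with a single instruction. A hedged C++ sketch of those idioms, assuming 32-bit unsigned operands (the function names are made up):

#include <cstdint>

uint32_t blsr(uint32_t x)    { return x & (x - 1); }   // clear lowest set bit (BMI)
uint32_t blsmsk(uint32_t x)  { return x ^ (x - 1); }   // mask up to lowest set bit (BMI)
uint32_t blsi(uint32_t x)    { return x & (0u - x); }  // isolate lowest set bit (BMI)
uint32_t blcfill(uint32_t x) { return x & (x + 1); }   // clear trailing one bits
uint32_t blcmsk(uint32_t x)  { return x ^ (x + 1); }   // mask up to lowest clear bit
uint32_t blcs(uint32_t x)    { return x | (x + 1); }   // set lowest clear bit
uint32_t blsfill(uint32_t x) { return x | (x - 1); }   // set trailing zero bits
uint32_t tzmsk(uint32_t x)   { return ~x & (x - 1); }  // mask of trailing zero bits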
+//===----------------------------------------------------------------------===// +// Subsystems. +//===----------------------------------------------------------------------===// + +include "X86Capstone.td" + +include "X86InstrArithmetic.td" +include "X86InstrCMovSetCC.td" +include "X86InstrExtension.td" +include "X86InstrControl.td" +include "X86InstrShiftRotate.td" + +// X87 Floating Point Stack. +include "X86InstrFPStack.td" + +// SIMD support (SSE, MMX and AVX) +include "X86InstrFragmentsSIMD.td" + +// FMA - Fused Multiply-Add support (requires FMA) +include "X86InstrFMA.td" + +// XOP +include "X86InstrXOP.td" + +// SSE, MMX and 3DNow! vector support. +include "X86InstrSSE.td" +include "X86InstrAVX512.td" +include "X86InstrMMX.td" +include "X86Instr3DNow.td" + +// MPX instructions +include "X86InstrMPX.td" + +include "X86InstrVMX.td" +include "X86InstrSVM.td" + +include "X86InstrTSX.td" +include "X86InstrSGX.td" + +// System instructions. +include "X86InstrSystem.td" + +// Compiler Pseudo Instructions and Pat Patterns +//include "X86InstrCompiler.td" +//include "X86InstrVecCompiler.td" + +//===----------------------------------------------------------------------===// +// Assembler Mnemonic Aliases +//===----------------------------------------------------------------------===// + +def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"cbw", "cbtw", "att">; +def : MnemonicAlias<"cwde", "cwtl", "att">; +def : MnemonicAlias<"cwd", "cwtd", "att">; +def : MnemonicAlias<"cdq", "cltd", "att">; +def : MnemonicAlias<"cdqe", "cltq", "att">; +def : MnemonicAlias<"cqo", "cqto", "att">; + +// In 64-bit mode lret maps to lretl; it is not ambiguous with lretq. +def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>; + +def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>; +def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; + +def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popf", "popfq", "intel">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popfd", "popfl", "att">; + +// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in +// all modes. However: "push (addr)" and "push $42" should default to +// pushl/pushq depending on the current mode. 
Similar for "pop %bx" +def : MnemonicAlias<"push", "pushw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushf", "pushfq", "intel">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushfd", "pushfl", "att">; + +def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>; +def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>; +def : MnemonicAlias<"popa", "popaw", "intel">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pusha", "pushaw", "intel">, Requires<[In16BitMode]>; +def : MnemonicAlias<"popa", "popal", "intel">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pusha", "pushal", "intel">, Requires<[In32BitMode]>; + +def : MnemonicAlias<"popa", "popaw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>; + +def : MnemonicAlias<"repe", "rep">; +def : MnemonicAlias<"repz", "rep">; +def : MnemonicAlias<"repnz", "repne">; + +def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>; + +// Apply 'ret' behavior to 'retn' +def : MnemonicAlias<"retn", "retw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"retn", "retl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"retn", "retq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"retn", "ret", "intel">; + +def : MnemonicAlias<"sal", "shl", "intel">; +def : MnemonicAlias<"salb", "shlb", "att">; +def : MnemonicAlias<"salw", "shlw", "att">; +def : MnemonicAlias<"sall", "shll", "att">; +def : MnemonicAlias<"salq", "shlq", "att">; + +def : MnemonicAlias<"smovb", "movsb", "att">; +def : MnemonicAlias<"smovw", "movsw", "att">; +def : MnemonicAlias<"smovl", "movsl", "att">; +def : MnemonicAlias<"smovq", "movsq", "att">; + +def : MnemonicAlias<"ud2a", "ud2", "att">; +def : MnemonicAlias<"verrw", "verr", "att">; + +// MS recognizes 'xacquire'/'xrelease' as 'acquire'/'release' +def : MnemonicAlias<"acquire", "xacquire", "intel">; +def : MnemonicAlias<"release", "xrelease", "intel">; + +// System instruction aliases. 
+def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>; +def : MnemonicAlias<"sysret", "sysretl", "att">; +def : MnemonicAlias<"sysexit", "sysexitl", "att">; + +def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>; +//def : MnemonicAlias<"lgdt", "lgdtw", "intel">, Requires<[In16BitMode]>; +//def : MnemonicAlias<"lgdt", "lgdtd", "intel">, Requires<[In32BitMode]>; +//def : MnemonicAlias<"lidt", "lidtw", "intel">, Requires<[In16BitMode]>; +//def : MnemonicAlias<"lidt", "lidtd", "intel">, Requires<[In32BitMode]>; +//def : MnemonicAlias<"sgdt", "sgdtw", "intel">, Requires<[In16BitMode]>; +//def : MnemonicAlias<"sgdt", "sgdtd", "intel">, Requires<[In32BitMode]>; +//def : MnemonicAlias<"sidt", "sidtw", "intel">, Requires<[In16BitMode]>; +//def : MnemonicAlias<"sidt", "sidtd", "intel">, Requires<[In32BitMode]>; + + +// Floating point stack aliases. +def : MnemonicAlias<"fcmovz", "fcmove", "att">; +def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; +def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; +def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; +def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; +def : MnemonicAlias<"fcomip", "fcompi">; +def : MnemonicAlias<"fildq", "fildll", "att">; +def : MnemonicAlias<"fistpq", "fistpll", "att">; +def : MnemonicAlias<"fisttpq", "fisttpll", "att">; +def : MnemonicAlias<"fldcww", "fldcw", "att">; +def : MnemonicAlias<"fnstcww", "fnstcw", "att">; +def : MnemonicAlias<"fnstsww", "fnstsw", "att">; +def : MnemonicAlias<"fucomip", "fucompi">; +def : MnemonicAlias<"fwait", "wait">; + +def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; +def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">; +def : MnemonicAlias<"xsaveq", "xsave64", "att">; +def : MnemonicAlias<"xrstorq", "xrstor64", "att">; +def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">; +def : MnemonicAlias<"xrstorsq", "xrstors64", "att">; +def : MnemonicAlias<"xsavecq", "xsavec64", "att">; +def : MnemonicAlias<"xsavesq", "xsaves64", "att">; + +class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, + string VariantName> + : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix), + !strconcat(Prefix, NewCond, Suffix), VariantName>; + +/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of +/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for +/// example "setz" -> "sete". 
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix, + string V = ""> { + def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb + def Z : CondCodeAlias<Prefix, Suffix, "z" , "e", V>; // setz -> sete + def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe + def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae + def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae + def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle + def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge + def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne + def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp + def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp + + def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb + def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta + def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl + def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg +} + +// Aliases for set<CC> +defm : IntegerCondCodeMnemonicAlias<"set", "">; +// Aliases for j<CC> +defm : IntegerCondCodeMnemonicAlias<"j", "">; +// Aliases for cmov<CC>{w,l,q} +defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">; +// No size suffix for intel-style asm. +defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">; + + +//===----------------------------------------------------------------------===// +// Assembler Instruction Aliases +//===----------------------------------------------------------------------===// + +// aad/aam default to base 10 if no operand is specified. +def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>; +def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; + +// Disambiguate the mem/imm form of bt-without-a-suffix as btl. +// Likewise for btc/btr/bts. +def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}", + (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; +def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}", + (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; +def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}", + (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; +def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}", + (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + +// clr aliases. +def : InstAlias<"clr{b}\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; +def : InstAlias<"clr{w}\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; +def : InstAlias<"clr{l}\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; +def : InstAlias<"clr{q}\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; + +// lods aliases. Accept the destination being omitted because it's implicit +// in the mnemonic, or the mnemonic suffix being omitted because it's implicit +// in the destination. 
+def : InstAlias<"lodsb\t$src", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lodsw\t$src", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods{l|d}\t$src", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lodsq\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lods\t$src", (LODSB srcidx8:$src), 0, "intel">; +def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0, "intel">; +def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0, "intel">; +def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>; + + +// stos aliases. Accept the source being omitted because it's implicit in +// the mnemonic, or the mnemonic suffix being omitted because it's implicit +// in the source. +def : InstAlias<"stosb\t$dst", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stosw\t$dst", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos{l|d}\t$dst", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stosq\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst), 0, "intel">; +def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0, "intel">; +def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0, "intel">; +def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0, "intel">, Requires<[In64BitMode]>; + + +// scas aliases. Accept the destination being omitted because it's implicit +// in the mnemonic, or the mnemonic suffix being omitted because it's implicit +// in the destination. +def : InstAlias<"scasb\t$dst", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scasw\t$dst", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas{l|d}\t$dst", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scasq\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst), 0, "intel">; +def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0, "intel">; +def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0, "intel">; +def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0, "intel">, Requires<[In64BitMode]>; + +// cmps aliases. Mnemonic suffix being omitted because it's implicit +// in the destination. +def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src), 0, "intel">; +def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0, "intel">; +def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0, "intel">; +def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>; + +// movs aliases. 
Mnemonic suffix being omitted because it's implicit +// in the destination. +def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src), 0, "intel">; +def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0, "intel">; +def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0, "intel">; +def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>; + +// div and idiv aliases for explicit A register. +def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>; +def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>; +def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>; +def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>; +def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>; +def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>; +def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>; +def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>; +def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>; +def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>; +def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>; +def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>; +def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>; +def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>; +def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>; +def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>; + + + +// Various unary fpstack operations default to operating on ST1. +// For example, "fxch" -> "fxch %st(1)" +def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; +def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>; +def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; +def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; +def : InstAlias<"fxch", (XCH_F ST1), 0>; +def : InstAlias<"fcom", (COM_FST0r ST1), 0>; +def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>; +def : InstAlias<"fcomi", (COM_FIr ST1), 0>; +def : InstAlias<"fcompi", (COM_FIPr ST1), 0>; +def : InstAlias<"fucom", (UCOM_Fr ST1), 0>; +def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>; +def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>; +def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>; + +// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op. +// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate +// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with +// gas. 
+multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { + def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"), + (Inst RST:$op), EmitAlias>; + def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"), + (Inst ST0), EmitAlias>; +} + +defm : FpUnaryAlias<"fadd", ADD_FST0r>; +defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; +defm : FpUnaryAlias<"fsub", SUB_FST0r>; +defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>; +defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; +defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>; +defm : FpUnaryAlias<"fmul", MUL_FST0r>; +defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; +defm : FpUnaryAlias<"fdiv", DIV_FST0r>; +defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>; +defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; +defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>; +defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; +defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; +defm : FpUnaryAlias<"fcompi", COM_FIPr>; +defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; + + +// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they +// commute. We also allow fdiv[r]p/fsubrp even though they don't commute, +// solely because gas supports it. +def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>; +def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>; +def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>; + +def : InstAlias<"fnstsw" , (FNSTSW16r), 0>; + +// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but +// this is compatible with what GAS does. +def : InstAlias<"lcall\t$seg : $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>; +def : InstAlias<"ljmp\t$seg : $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>; +def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaquemem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaquemem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall\t$seg : $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp\t$seg : $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaquemem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaquemem:$dst), 0>, Requires<[In16BitMode]>; + +def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0, "att">, Requires<[In64BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0, "att">, Requires<[In32BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0, "att">, Requires<[In16BitMode]>; + + +// "imul <imm>, B" is an alias for "imul <imm>, B, B". 
+def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; +def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; +def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; +def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; +def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; +def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; + +// ins aliases. Accept the mnemonic suffix being omitted because it's implicit +// in the destination. +def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSB dstidx8:$dst), 0, "intel">; +def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSW dstidx16:$dst), 0, "intel">; +def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSL dstidx32:$dst), 0, "intel">; + +// outs aliases. Accept the mnemonic suffix being omitted because it's implicit +// in the source. +def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSB srcidx8:$src), 0, "intel">; +def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSW srcidx16:$src), 0, "intel">; +def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSL srcidx32:$src), 0, "intel">; + +// inb %dx -> inb %al, %dx +def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; +def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>; +def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>; +def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>; +def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>; +def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>; + + +// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp +def : InstAlias<"call\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"jmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>; +def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>; +def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>; +def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>; +def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>; +def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>; + +// Match 'movq <largeimm>, <reg>' as an alias for movabsq. +def : InstAlias<"mov{q}\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; + +// Match 'movd GR64, MMX' as an alias for movq to be compatible with gas, +// which supports this due to an old AMD documentation bug when 64-bit mode was +// created. 
+def : InstAlias<"movd\t{$src, $dst|$dst, $src}", + (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; +def : InstAlias<"movd\t{$src, $dst|$dst, $src}", + (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; + +// movsx aliases +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0, "att">; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0, "att">; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0, "att">; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0, "att">; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0, "att">; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0, "att">; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0, "att">; + +// movzx aliases +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0, "att">; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0, "att">; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0, "att">; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0, "att">; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0, "att">; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0, "att">; +// Note: No GR32->GR64 movzx form. + +// outb %dx -> outb %al, %dx +def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>; +def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>; +def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>; +def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>; +def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>; +def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>; + +// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same +// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity +// errors, since its encoding is the most compact. +def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>; + +// shld/shrd op,op -> shld op, op, CL +def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>; +def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>; + +def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>; +def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>; + +/* FIXME: This is disabled because the asm matcher is currently incapable of + * matching a fixed immediate like $1. +// "shl X, $1" is an alias for "shl X". 
+multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> { + def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>; + def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>; + def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>; + def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>; + def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>; +} + +defm : ShiftRotateByOneAlias<"rcl", "RCL">; +defm : ShiftRotateByOneAlias<"rcr", "RCR">; +defm : ShiftRotateByOneAlias<"rol", "ROL">; +defm : ShiftRotateByOneAlias<"ror", "ROR">; +FIXME */ + +// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms. +def : InstAlias<"test{b}\t{$mem, $val|$val, $mem}", + (TEST8mr i8mem :$mem, GR8 :$val), 0>; +def : InstAlias<"test{w}\t{$mem, $val|$val, $mem}", + (TEST16mr i16mem:$mem, GR16:$val), 0>; +def : InstAlias<"test{l}\t{$mem, $val|$val, $mem}", + (TEST32mr i32mem:$mem, GR32:$val), 0>; +def : InstAlias<"test{q}\t{$mem, $val|$val, $mem}", + (TEST64mr i64mem:$mem, GR64:$val), 0>; + +// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms. +def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", + (XCHG8rm GR8 :$val, i8mem :$mem), 0>; +def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", + (XCHG16rm GR16:$val, i16mem:$mem), 0>; +def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", + (XCHG32rm GR32:$val, i32mem:$mem), 0>; +def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", + (XCHG64rm GR64:$val, i64mem:$mem), 0>; + +// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms. +def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src), 0>; +def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>; + +// In 64-bit mode, xchg %eax, %eax can't be encoded with the 0x90 opcode we +// would get by default because it's defined as NOP. But xchg %eax, %eax implies +// implicit zeroing of the upper 32 bits. So alias to the longer encoding. +def : InstAlias<"xchg{l}\t{%eax, %eax|eax, eax}", + (XCHG32rr EAX, EAX), 0>, Requires<[In64BitMode]>; + +// xchg %rax, %rax is a nop in x86-64 and can be encoded as such. Without this +// we emit an unneeded REX.w prefix. +def : InstAlias<"xchg{q}\t{%rax, %rax|rax, rax}", (NOOP), 0>; + +// These aliases exist to get the parser to prioritize matching 8-bit +// immediate encodings over matching the implicit ax/eax/rax encodings. By +// explicitly mentioning the A register here, these entries will be ordered +// first due to the more explicit immediate type. 
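For context on the aliases that follow: an immediate that survives a sign-extending round trip through 8 bits can use the shorter sign-extended-imm8 encodings (for example, "add $1, %eax" assembles to the 3-byte 83 C0 01 form rather than the 5-byte EAX-specific 05 01 00 00 00 form), so the parser should try those forms first. A minimal standalone C++ sketch of that fit check, separate from this .td file and from LLVM's isInt<8> helper:

#include <cstdint>
#include <cstdio>

// Returns true when a 32-bit immediate can be encoded as a sign-extended
// 8-bit immediate, the property the *ri8 instruction forms rely on.
static bool fitsInSignExtended8(int32_t Imm) {
  return Imm == static_cast<int32_t>(static_cast<int8_t>(Imm));
}

int main() {
  const int32_t Samples[] = {1, 127, 128, -1, -128, -129};
  for (int32_t Imm : Samples)
    std::printf("%6d -> %s\n", Imm,
                fitsInSignExtended8(Imm) ? "imm8 form" : "imm16/imm32 form");
  return 0;
}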
+def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>; + +def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>; + +def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrInfo_reduce.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrInfo_reduce.td new file mode 100644 index 000000000..9aa8425b1 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrInfo_reduce.td @@ -0,0 +1,3572 @@ +//===-- X86InstrInfo.td - Main X86 Instruction Definition --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 instruction set, defining the instructions, and +// properties of the instructions which are needed for code generation, machine +// code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// X86 specific DAG Nodes. 
+// + +def SDTIntShiftDOp: SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisInt<3>]>; + +def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>; + +def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; +//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + +def SDTX86Cmov : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; + +// Unary and binary operator instructions that set EFLAGS as a side-effect. +def SDTUnaryArithWithFlags : SDTypeProfile<2, 1, + [SDTCisSameAs<0, 2>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, EFLAGS = op LHS, RHS, EFLAGS +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; +// RES1, RES2, FLAGS = op LHS, RHS +def SDT2ResultBinaryArithWithFlags : SDTypeProfile<3, 2, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; +def SDTX86BrCond : SDTypeProfile<0, 3, + [SDTCisVT<0, OtherVT>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; + +def SDTX86SetCC : SDTypeProfile<1, 2, + [SDTCisVT<0, i8>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; +def SDTX86SetCC_C : SDTypeProfile<1, 2, + [SDTCisInt<0>, + SDTCisVT<1, i8>, SDTCisVT<2, i32>]>; + +def SDTX86sahf : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i8>]>; + +def SDTX86rdrand : SDTypeProfile<2, 0, [SDTCisInt<0>, SDTCisVT<1, i32>]>; + +def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, + SDTCisVT<2, i8>]>; +def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDTX86caspairSaveEbx8 : SDTypeProfile<1, 3, + [SDTCisVT<0, i32>, SDTCisPtrTy<1>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; +def SDTX86caspairSaveRbx16 : SDTypeProfile<1, 3, + [SDTCisVT<0, i64>, SDTCisPtrTy<1>, + SDTCisVT<2, i64>, SDTCisVT<3, i64>]>; + +def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCisPtrTy<1>, + SDTCisInt<2>]>; + +def SDTLockUnaryArithWithFlags : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, + SDTCisPtrTy<1>]>; + +def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>; + +def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; +def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; + +def SDT_X86Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; + +def SDT_X86NtBrind : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; + +def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>, + SDTCisVT<1, iPTR>, + SDTCisVT<2, iPTR>]>; + +def SDT_X86VAARG_64 : SDTypeProfile<1, -1, [SDTCisPtrTy<0>, + SDTCisPtrTy<1>, + SDTCisVT<2, i32>, + SDTCisVT<3, i8>, + SDTCisVT<4, i32>]>; + +def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>; + +def SDTX86Void : SDTypeProfile<0, 0, []>; + +def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; + +def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + +def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>; + +def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; + +def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; + 
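A note on reading these profiles: they fix only result/operand counts and types. SDTBinaryArithWithFlags above, for instance, describes a node that produces a value plus an i32 EFLAGS result from two operands of the value's type. A rough standalone illustration in plain C++ (not the LLVM API, and with AF/PF left out) of binary arithmetic that also yields flags:

#include <cstdint>
#include <cstdio>

struct ResultWithFlags {
  uint32_t Value;
  bool CF, ZF, SF, OF; // flag bits an x86 ADD would update (AF/PF omitted)
};

// 32-bit add that also reports the flag-style side results.
static ResultWithFlags add32(uint32_t A, uint32_t B) {
  ResultWithFlags R;
  R.Value = A + B;
  R.CF = R.Value < A;                            // unsigned carry out
  R.ZF = R.Value == 0;
  R.SF = (R.Value >> 31) & 1;
  R.OF = ((~(A ^ B) & (A ^ R.Value)) >> 31) & 1; // signed overflow
  return R;
}

int main() {
  ResultWithFlags R = add32(0x7FFFFFFFu, 1); // INT_MAX + 1 overflows the signed add
  std::printf("value=%08x CF=%d ZF=%d SF=%d OF=%d\n",
              R.Value, R.CF, R.ZF, R.SF, R.OF);
  return 0;
}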
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; + +def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>; + +def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER, + [SDNPHasChain,SDNPSideEffect]>; +def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER, + [SDNPHasChain]>; + + +def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>; +def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>; +def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>; +def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>; + +def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>; +def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>; + +def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>; +def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond, + [SDNPHasChain]>; +def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>; +def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>; + +def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>; + +def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + +def X86rdseed : SDNode<"X86ISD::RDSEED", SDTX86rdrand, + [SDNPHasChain, SDNPSideEffect]>; + +def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad, SDNPMemOperand]>; +def X86cas8save_ebx : SDNode<"X86ISD::LCMPXCHG8_SAVE_EBX_DAG", + SDTX86caspairSaveEbx8, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +def X86cas16save_rbx : SDNode<"X86ISD::LCMPXCHG16_SAVE_RBX_DAG", + SDTX86caspairSaveRbx16, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; + +def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue]>; + +def X86vastart_save_xmm_regs : + SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", + SDT_X86VASTART_SAVE_XMM_REGS, + [SDNPHasChain, SDNPVariadic]>; +def X86vaarg64 : + SDNode<"X86ISD::VAARG_64", SDT_X86VAARG_64, + [SDNPHasChain, SDNPMayLoad, SDNPMayStore, + SDNPMemOperand]>; +def X86callseq_start : + SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def X86callseq_end : + SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86call : SDNode<"X86ISD::CALL", SDT_X86Call, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, + SDNPVariadic]>; + +def X86NoTrackCall : SDNode<"X86ISD::NT_CALL", SDT_X86Call, + [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, + SDNPVariadic]>; +def X86NoTrackBrind : SDNode<"X86ISD::NT_BRIND", SDT_X86NtBrind, + [SDNPHasChain]>; + +def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore]>; +def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore, + SDNPMayLoad]>; + +def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdtscp : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def X86rdpmc : SDNode<"X86ISD::RDPMC_DAG", SDTX86Void, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; + +def X86Wrapper : 
SDNode<"X86ISD::Wrapper", SDTX86Wrapper>; +def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>; + +def X86RecoverFrameAlloc : SDNode<"ISD::LOCAL_RECOVER", + SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisInt<1>]>>; + +def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET, + [SDNPHasChain]>; + +def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP", + SDTypeProfile<1, 1, [SDTCisInt<0>, + SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect]>; +def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPSideEffect]>; +def X86eh_sjlj_setup_dispatch : SDNode<"X86ISD::EH_SJLJ_SETUP_DISPATCH", + SDTypeProfile<0, 0, []>, + [SDNPHasChain, SDNPSideEffect]>; + +def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def X86add_flag : SDNode<"X86ISD::ADD", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86sub_flag : SDNode<"X86ISD::SUB", SDTBinaryArithWithFlags>; +def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86umul_flag : SDNode<"X86ISD::UMUL", SDT2ResultBinaryArithWithFlags, + [SDNPCommutative]>; +def X86adc_flag : SDNode<"X86ISD::ADC", SDTBinaryArithWithFlagsInOut>; +def X86sbb_flag : SDNode<"X86ISD::SBB", SDTBinaryArithWithFlagsInOut>; + +def X86inc_flag : SDNode<"X86ISD::INC", SDTUnaryArithWithFlags>; +def X86dec_flag : SDNode<"X86ISD::DEC", SDTUnaryArithWithFlags>; +def X86or_flag : SDNode<"X86ISD::OR", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags, + [SDNPCommutative]>; +def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, + [SDNPCommutative]>; + +def X86lock_add : SDNode<"X86ISD::LADD", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_sub : SDNode<"X86ISD::LSUB", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_or : SDNode<"X86ISD::LOR", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_xor : SDNode<"X86ISD::LXOR", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; + +def X86lock_inc : SDNode<"X86ISD::LINC", SDTLockUnaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_dec : SDNode<"X86ISD::LDEC", SDTLockUnaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; + +def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>; + +def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; + +def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA, + [SDNPHasChain, SDNPOutGlue]>; + +def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, + [SDNPHasChain]>; + +def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86lwpins : SDNode<"X86ISD::LWPINS", + SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPSideEffect]>; + +def X86umwait : 
SDNode<"X86ISD::UMWAIT", + SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, + [SDNPHasChain, SDNPSideEffect]>; + +def X86tpause : SDNode<"X86ISD::TPAUSE", + SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisInt<1>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>]>, + [SDNPHasChain, SDNPSideEffect]>; + +//===----------------------------------------------------------------------===// +// X86 Operand Definitions. +// + +// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for +// the index operand of an address, to conform to x86 encoding restrictions. +def ptr_rc_nosp : PointerLikeRegClass<1>; + +// *mem - Operand definitions for the funky X86 addressing mode operands. +// +def X86MemAsmOperand : AsmOperandClass { + let Name = "Mem"; +} +let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in { + def X86Mem8AsmOperand : AsmOperandClass { let Name = "Mem8"; } + def X86Mem16AsmOperand : AsmOperandClass { let Name = "Mem16"; } + def X86Mem32AsmOperand : AsmOperandClass { let Name = "Mem32"; } + def X86Mem64AsmOperand : AsmOperandClass { let Name = "Mem64"; } + def X86Mem80AsmOperand : AsmOperandClass { let Name = "Mem80"; } + def X86Mem128AsmOperand : AsmOperandClass { let Name = "Mem128"; } + def X86Mem256AsmOperand : AsmOperandClass { let Name = "Mem256"; } + def X86Mem512AsmOperand : AsmOperandClass { let Name = "Mem512"; } + // Gather mem operands + def X86Mem64_RC128Operand : AsmOperandClass { let Name = "Mem64_RC128"; } + def X86Mem128_RC128Operand : AsmOperandClass { let Name = "Mem128_RC128"; } + def X86Mem256_RC128Operand : AsmOperandClass { let Name = "Mem256_RC128"; } + def X86Mem128_RC256Operand : AsmOperandClass { let Name = "Mem128_RC256"; } + def X86Mem256_RC256Operand : AsmOperandClass { let Name = "Mem256_RC256"; } + + def X86Mem64_RC128XOperand : AsmOperandClass { let Name = "Mem64_RC128X"; } + def X86Mem128_RC128XOperand : AsmOperandClass { let Name = "Mem128_RC128X"; } + def X86Mem256_RC128XOperand : AsmOperandClass { let Name = "Mem256_RC128X"; } + def X86Mem128_RC256XOperand : AsmOperandClass { let Name = "Mem128_RC256X"; } + def X86Mem256_RC256XOperand : AsmOperandClass { let Name = "Mem256_RC256X"; } + def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; } + def X86Mem256_RC512Operand : AsmOperandClass { let Name = "Mem256_RC512"; } + def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; } +} + +def X86AbsMemAsmOperand : AsmOperandClass { + let Name = "AbsMem"; + let SuperClasses = [X86MemAsmOperand]; +} + +class X86MemOperand<string printMethod, + AsmOperandClass parserMatchClass = X86MemAsmOperand> : Operand<iPTR> { + let PrintMethod = printMethod; + let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, SEGMENT_REG); + let ParserMatchClass = parserMatchClass; + let OperandType = "OPERAND_MEMORY"; +} + +// Gather mem operands +class X86VMemOperand<RegisterClass RC, string printMethod, + AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG); +} + +def anymem : X86MemOperand<"printanymem">; + +// FIXME: Right now we allow any size during parsing, but we might want to +// restrict to only unsized memory. 
+def opaquemem : X86MemOperand<"printopaquemem">; + +def i8mem : X86MemOperand<"printi8mem", X86Mem8AsmOperand>; +def i16mem : X86MemOperand<"printi16mem", X86Mem16AsmOperand>; +def i32mem : X86MemOperand<"printi32mem", X86Mem32AsmOperand>; +def i64mem : X86MemOperand<"printi64mem", X86Mem64AsmOperand>; +def i128mem : X86MemOperand<"printi128mem", X86Mem128AsmOperand>; +def i256mem : X86MemOperand<"printi256mem", X86Mem256AsmOperand>; +def i512mem : X86MemOperand<"printi512mem", X86Mem512AsmOperand>; +def f32mem : X86MemOperand<"printf32mem", X86Mem32AsmOperand>; +def f64mem : X86MemOperand<"printf64mem", X86Mem64AsmOperand>; +def f80mem : X86MemOperand<"printf80mem", X86Mem80AsmOperand>; +def f128mem : X86MemOperand<"printf128mem", X86Mem128AsmOperand>; +def f256mem : X86MemOperand<"printf256mem", X86Mem256AsmOperand>; +def f512mem : X86MemOperand<"printf512mem", X86Mem512AsmOperand>; + +def v512mem : X86VMemOperand<VR512, "printf512mem", X86Mem512AsmOperand>; + +// Gather mem operands +def vx64mem : X86VMemOperand<VR128, "printi64mem", X86Mem64_RC128Operand>; +def vx128mem : X86VMemOperand<VR128, "printi128mem", X86Mem128_RC128Operand>; +def vx256mem : X86VMemOperand<VR128, "printi256mem", X86Mem256_RC128Operand>; +def vy128mem : X86VMemOperand<VR256, "printi128mem", X86Mem128_RC256Operand>; +def vy256mem : X86VMemOperand<VR256, "printi256mem", X86Mem256_RC256Operand>; + +def vx64xmem : X86VMemOperand<VR128X, "printi64mem", X86Mem64_RC128XOperand>; +def vx128xmem : X86VMemOperand<VR128X, "printi128mem", X86Mem128_RC128XOperand>; +def vx256xmem : X86VMemOperand<VR128X, "printi256mem", X86Mem256_RC128XOperand>; +def vy128xmem : X86VMemOperand<VR256X, "printi128mem", X86Mem128_RC256XOperand>; +def vy256xmem : X86VMemOperand<VR256X, "printi256mem", X86Mem256_RC256XOperand>; +def vy512xmem : X86VMemOperand<VR256X, "printi512mem", X86Mem512_RC256XOperand>; +def vz256mem : X86VMemOperand<VR512, "printi256mem", X86Mem256_RC512Operand>; +def vz512mem : X86VMemOperand<VR512, "printi512mem", X86Mem512_RC512Operand>; + +// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead +// of a plain GPR, so that it doesn't potentially require a REX prefix. +def ptr_rc_norex : PointerLikeRegClass<2>; +def ptr_rc_norex_nosp : PointerLikeRegClass<3>; + +def i8mem_NOREX : Operand<iPTR> { + let PrintMethod = "printi8mem"; + let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, + SEGMENT_REG); + let ParserMatchClass = X86Mem8AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +// GPRs available for tailcall. +// It represents GR32_TC, GR64_TC or GR64_TCW64. +def ptr_rc_tailcall : PointerLikeRegClass<4>; + +// Special i32mem for addresses of load folding tail calls. These are not +// allowed to use callee-saved registers since they must be scheduled +// after callee-saved register are popped. +def i32mem_TC : Operand<i32> { + let PrintMethod = "printi32mem"; + let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, ptr_rc_tailcall, + i32imm, SEGMENT_REG); + let ParserMatchClass = X86Mem32AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +// Special i64mem for addresses of load folding tail calls. These are not +// allowed to use callee-saved registers since they must be scheduled +// after callee-saved register are popped. 
+def i64mem_TC : Operand<i64> { + let PrintMethod = "printi64mem"; + let MIOperandInfo = (ops ptr_rc_tailcall, i8imm, + ptr_rc_tailcall, i32imm, SEGMENT_REG); + let ParserMatchClass = X86Mem64AsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + +let OperandType = "OPERAND_PCREL", + ParserMatchClass = X86AbsMemAsmOperand, + PrintMethod = "printPCRelImm" in { +def i32imm_pcrel : Operand<i32>; +def i16imm_pcrel : Operand<i16>; + +// Branch targets have OtherVT type and print as pc-relative values. +def brtarget : Operand<OtherVT>; +def brtarget8 : Operand<OtherVT>; + +} + +// Special parser to detect 16-bit mode to select 16-bit displacement. +def X86AbsMem16AsmOperand : AsmOperandClass { + let Name = "AbsMem16"; + let RenderMethod = "addAbsMemOperands"; + let SuperClasses = [X86AbsMemAsmOperand]; +} + +// Branch targets have OtherVT type and print as pc-relative values. +let OperandType = "OPERAND_PCREL", + PrintMethod = "printPCRelImm" in { +let ParserMatchClass = X86AbsMem16AsmOperand in + def brtarget16 : Operand<OtherVT>; +let ParserMatchClass = X86AbsMemAsmOperand in + def brtarget32 : Operand<OtherVT>; +} + +let RenderMethod = "addSrcIdxOperands" in { + def X86SrcIdx8Operand : AsmOperandClass { + let Name = "SrcIdx8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86SrcIdx16Operand : AsmOperandClass { + let Name = "SrcIdx16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86SrcIdx32Operand : AsmOperandClass { + let Name = "SrcIdx32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86SrcIdx64Operand : AsmOperandClass { + let Name = "SrcIdx64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addSrcIdxOperands" + +let RenderMethod = "addDstIdxOperands" in { + def X86DstIdx8Operand : AsmOperandClass { + let Name = "DstIdx8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86DstIdx16Operand : AsmOperandClass { + let Name = "DstIdx16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86DstIdx32Operand : AsmOperandClass { + let Name = "DstIdx32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86DstIdx64Operand : AsmOperandClass { + let Name = "DstIdx64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addDstIdxOperands" + +let RenderMethod = "addMemOffsOperands" in { + def X86MemOffs16_8AsmOperand : AsmOperandClass { + let Name = "MemOffs16_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs16_16AsmOperand : AsmOperandClass { + let Name = "MemOffs16_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs16_32AsmOperand : AsmOperandClass { + let Name = "MemOffs16_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs32_8AsmOperand : AsmOperandClass { + let Name = "MemOffs32_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs32_16AsmOperand : AsmOperandClass { + let Name = "MemOffs32_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs32_32AsmOperand : AsmOperandClass { + let Name = "MemOffs32_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } + def X86MemOffs32_64AsmOperand : AsmOperandClass { + let Name = "MemOffs32_64"; + let SuperClasses = [X86Mem64AsmOperand]; + } + def X86MemOffs64_8AsmOperand : AsmOperandClass { + let Name = "MemOffs64_8"; + let SuperClasses = [X86Mem8AsmOperand]; + } + def X86MemOffs64_16AsmOperand : AsmOperandClass { + let Name = "MemOffs64_16"; + let SuperClasses = [X86Mem16AsmOperand]; + } + def X86MemOffs64_32AsmOperand : AsmOperandClass { + let Name = "MemOffs64_32"; + let SuperClasses = [X86Mem32AsmOperand]; + } 
+ def X86MemOffs64_64AsmOperand : AsmOperandClass { + let Name = "MemOffs64_64"; + let SuperClasses = [X86Mem64AsmOperand]; + } +} // RenderMethod = "addMemOffsOperands" + +class X86SrcIdxOperand<string printMethod, AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc, SEGMENT_REG); +} + +class X86DstIdxOperand<string printMethod, AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops ptr_rc); +} + +def srcidx8 : X86SrcIdxOperand<"printSrcIdx8", X86SrcIdx8Operand>; +def srcidx16 : X86SrcIdxOperand<"printSrcIdx16", X86SrcIdx16Operand>; +def srcidx32 : X86SrcIdxOperand<"printSrcIdx32", X86SrcIdx32Operand>; +def srcidx64 : X86SrcIdxOperand<"printSrcIdx64", X86SrcIdx64Operand>; +def dstidx8 : X86DstIdxOperand<"printDstIdx8", X86DstIdx8Operand>; +def dstidx16 : X86DstIdxOperand<"printDstIdx16", X86DstIdx16Operand>; +def dstidx32 : X86DstIdxOperand<"printDstIdx32", X86DstIdx32Operand>; +def dstidx64 : X86DstIdxOperand<"printDstIdx64", X86DstIdx64Operand>; + +class X86MemOffsOperand<Operand immOperand, string printMethod, + AsmOperandClass parserMatchClass> + : X86MemOperand<printMethod, parserMatchClass> { + let MIOperandInfo = (ops immOperand, SEGMENT_REG); +} + +def offset16_8 : X86MemOffsOperand<i16imm, "printMemOffs8", + X86MemOffs16_8AsmOperand>; +def offset16_16 : X86MemOffsOperand<i16imm, "printMemOffs16", + X86MemOffs16_16AsmOperand>; +def offset16_32 : X86MemOffsOperand<i16imm, "printMemOffs32", + X86MemOffs16_32AsmOperand>; +def offset32_8 : X86MemOffsOperand<i32imm, "printMemOffs8", + X86MemOffs32_8AsmOperand>; +def offset32_16 : X86MemOffsOperand<i32imm, "printMemOffs16", + X86MemOffs32_16AsmOperand>; +def offset32_32 : X86MemOffsOperand<i32imm, "printMemOffs32", + X86MemOffs32_32AsmOperand>; +def offset32_64 : X86MemOffsOperand<i32imm, "printMemOffs64", + X86MemOffs32_64AsmOperand>; +def offset64_8 : X86MemOffsOperand<i64imm, "printMemOffs8", + X86MemOffs64_8AsmOperand>; +def offset64_16 : X86MemOffsOperand<i64imm, "printMemOffs16", + X86MemOffs64_16AsmOperand>; +def offset64_32 : X86MemOffsOperand<i64imm, "printMemOffs32", + X86MemOffs64_32AsmOperand>; +def offset64_64 : X86MemOffsOperand<i64imm, "printMemOffs64", + X86MemOffs64_64AsmOperand>; + +def SSECC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def AVXCC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def AVX512ICC : Operand<i8> { + let PrintMethod = "printSSEAVXCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +def XOPCC : Operand<i8> { + let PrintMethod = "printXOPCC"; + let OperandType = "OPERAND_IMMEDIATE"; +} + +class ImmSExtAsmOperandClass : AsmOperandClass { + let SuperClasses = [ImmAsmOperand]; + let RenderMethod = "addImmOperands"; +} + +def X86GR32orGR64AsmOperand : AsmOperandClass { + let Name = "GR32orGR64"; +} + +def GR32orGR64 : RegisterOperand<GR32> { + let ParserMatchClass = X86GR32orGR64AsmOperand; +} +def AVX512RCOperand : AsmOperandClass { + let Name = "AVX512RC"; +} +def AVX512RC : Operand<i32> { + let PrintMethod = "printRoundingControl"; + let OperandType = "OPERAND_IMMEDIATE"; + let ParserMatchClass = AVX512RCOperand; +} + +// Sign-extended immediate classes. We don't need to define the full lattice +// here because there is no instruction with an ambiguity between ImmSExti64i32 +// and ImmSExti32i8. 
+// +// The strange ranges come from the fact that the assembler always works with +// 64-bit immediates, but for a 16-bit target value we want to accept both "-1" +// (which will be a -1ULL), and "0xFF" (-1 in 16-bits). + +// [0, 0x7FFFFFFF] | +// [0xFFFFFFFF80000000, 0xFFFFFFFFFFFFFFFF] +def ImmSExti64i32AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti64i32"; +} + +// [0, 0x0000007F] | [0x000000000000FF80, 0x000000000000FFFF] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti16i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti16i8"; + let SuperClasses = [ImmSExti64i32AsmOperand]; +} + +// [0, 0x0000007F] | [0x00000000FFFFFF80, 0x00000000FFFFFFFF] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti32i8"; +} + +// [0, 0x0000007F] | +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass { + let Name = "ImmSExti64i8"; + let SuperClasses = [ImmSExti16i8AsmOperand, ImmSExti32i8AsmOperand, + ImmSExti64i32AsmOperand]; +} + +// Unsigned immediate used by SSE/AVX instructions +// [0, 0xFF] +// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF] +def ImmUnsignedi8AsmOperand : AsmOperandClass { + let Name = "ImmUnsignedi8"; + let RenderMethod = "addImmOperands"; +} + +// A couple of more descriptive operand definitions. +// 16-bits but only 8 bits are significant. +def i16i8imm : Operand<i16> { + let ParserMatchClass = ImmSExti16i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} +// 32-bits but only 8 bits are significant. +def i32i8imm : Operand<i32> { + let ParserMatchClass = ImmSExti32i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 32 bits are significant. +def i64i32imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i32AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 8 bits are significant. +def i64i8imm : Operand<i64> { + let ParserMatchClass = ImmSExti64i8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// Unsigned 8-bit immediate used by SSE/AVX instructions. +def u8imm : Operand<i8> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 32-bit immediate but only 8-bits are significant and they are unsigned. +// Used by some SSE/AVX instructions that use intrinsics. +def i32u8imm : Operand<i32> { + let PrintMethod = "printU8Imm"; + let ParserMatchClass = ImmUnsignedi8AsmOperand; + let OperandType = "OPERAND_IMMEDIATE"; +} + +// 64-bits but only 32 bits are significant, and those bits are treated as being +// pc relative. +def i64i32imm_pcrel : Operand<i64> { + let PrintMethod = "printPCRelImm"; + let ParserMatchClass = X86AbsMemAsmOperand; + let OperandType = "OPERAND_PCREL"; +} + +def lea64_32mem : Operand<i32> { + let PrintMethod = "printanymem"; + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG); + let ParserMatchClass = X86MemAsmOperand; +} + +// Memory operands that use 64-bit pointers in both ILP32 and LP64. +def lea64mem : Operand<i64> { + let PrintMethod = "printanymem"; + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG); + let ParserMatchClass = X86MemAsmOperand; +} + + +//===----------------------------------------------------------------------===// +// X86 Complex Pattern Definitions. +// + +// Define X86-specific addressing mode. 
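Throughout this file a memory reference is the five-operand tuple seen in MIOperandInfo above: base register, scale immediate, index register, displacement, and segment. The complex patterns below pick those pieces apart during selection. A small standalone C++ sketch (hypothetical names, segment handling omitted for a flat address space) of how the pieces combine:

#include <cstdint>
#include <cstdio>

// Hypothetical model of the (base, scale, index, disp, segment) operand form.
struct X86AddressMode {
  uint64_t Base;   // value of the base register, 0 if absent
  uint8_t  Scale;  // 1, 2, 4 or 8
  uint64_t Index;  // value of the index register, 0 if absent
  int32_t  Disp;   // signed displacement
};

static uint64_t effectiveAddress(const X86AddressMode &AM) {
  return AM.Base + static_cast<uint64_t>(AM.Scale) * AM.Index +
         static_cast<uint64_t>(static_cast<int64_t>(AM.Disp));
}

int main() {
  // Roughly "16(%rbx,%rcx,4)" with %rbx = 0x1000 and %rcx = 3.
  X86AddressMode AM = {0x1000, 4, 3, 16};
  std::printf("effective address = 0x%llx\n",
              (unsigned long long)effectiveAddress(AM));
  return 0;
}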
+def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>; +def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, frameindex], + []>; +// In 64-bit mode 32-bit LEAs can use RIP-relative addressing. +def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr", + [add, sub, mul, X86mul_imm, shl, or, + frameindex, X86WrapperRIP], + []>; + +def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr", + [add, sub, mul, X86mul_imm, shl, or, frameindex, + X86WrapperRIP], []>; + +def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr", + [tglobaltlsaddr], []>; + +def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>; + +// A relocatable immediate is either an immediate operand or an operand that can +// be relocated by the linker to an immediate, such as a regular symbol in +// non-PIC code. +def relocImm : ComplexPattern<iAny, 1, "selectRelocImm", [imm, X86Wrapper], [], + 0>; + +//===----------------------------------------------------------------------===// +// X86 Instruction Predicate Definitions. +def TruePredicate : Predicate<"true">; + +def HasCMov : Predicate<"Subtarget->hasCMov()">; +def NoCMov : Predicate<"!Subtarget->hasCMov()">; + +def HasMMX : Predicate<"Subtarget->hasMMX()">; +def Has3DNow : Predicate<"Subtarget->has3DNow()">; +def Has3DNowA : Predicate<"Subtarget->has3DNowA()">; +def HasSSE1 : Predicate<"Subtarget->hasSSE1()">; +def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">; +def HasSSE2 : Predicate<"Subtarget->hasSSE2()">; +def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">; +def HasSSE3 : Predicate<"Subtarget->hasSSE3()">; +def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">; +def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">; +def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">; +def HasSSE41 : Predicate<"Subtarget->hasSSE41()">; +def NoSSE41 : Predicate<"!Subtarget->hasSSE41()">; +def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">; +def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; +def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">; +def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; +def NoAVX : Predicate<"!Subtarget->hasAVX()">; +def HasAVX : Predicate<"Subtarget->hasAVX()">; +def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; +def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; +def HasAVX512 : Predicate<"Subtarget->hasAVX512()">; +def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; +def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; +def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; +def HasCDI : Predicate<"Subtarget->hasCDI()">; +def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">; +def HasPFI : Predicate<"Subtarget->hasPFI()">; +def HasERI : Predicate<"Subtarget->hasERI()">; +def HasDQI : Predicate<"Subtarget->hasDQI()">; +def NoDQI : Predicate<"!Subtarget->hasDQI()">; +def HasBWI : Predicate<"Subtarget->hasBWI()">; +def NoBWI : Predicate<"!Subtarget->hasBWI()">; +def HasVLX : Predicate<"Subtarget->hasVLX()">; +def NoVLX : Predicate<"!Subtarget->hasVLX()">; +def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() 
|| !Subtarget->hasBWI()">; +def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; +def PKU : Predicate<"Subtarget->hasPKU()">; +def HasVNNI : Predicate<"Subtarget->hasVNNI()">; + +def HasBITALG : Predicate<"Subtarget->hasBITALG()">; +def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; +def HasAES : Predicate<"Subtarget->hasAES()">; +def HasVAES : Predicate<"Subtarget->hasVAES()">; +def NoVLX_Or_NoVAES : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVAES()">; +def HasFXSR : Predicate<"Subtarget->hasFXSR()">; +def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">; +def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">; +def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">; +def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">; +def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; +def NoVLX_Or_NoVPCLMULQDQ : + Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVPCLMULQDQ()">; +def HasVPCLMULQDQ : Predicate<"Subtarget->hasVPCLMULQDQ()">; +def HasGFNI : Predicate<"Subtarget->hasGFNI()">; +def HasFMA : Predicate<"Subtarget->hasFMA()">; +def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; +def NoFMA4 : Predicate<"!Subtarget->hasFMA4()">; +def HasXOP : Predicate<"Subtarget->hasXOP()">; +def HasTBM : Predicate<"Subtarget->hasTBM()">; +def NoTBM : Predicate<"!Subtarget->hasTBM()">; +def HasLWP : Predicate<"Subtarget->hasLWP()">; +def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; +def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; +def HasF16C : Predicate<"Subtarget->hasF16C()">; +def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">; +def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; +def HasBMI : Predicate<"Subtarget->hasBMI()">; +def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; +def NoBMI2 : Predicate<"!Subtarget->hasBMI2()">; +def HasVBMI : Predicate<"Subtarget->hasVBMI()">; +def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">; +def HasIFMA : Predicate<"Subtarget->hasIFMA()">; +def HasRTM : Predicate<"Subtarget->hasRTM()">; +def HasADX : Predicate<"Subtarget->hasADX()">; +def HasSHA : Predicate<"Subtarget->hasSHA()">; +def HasSGX : Predicate<"Subtarget->hasSGX()">; +def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; +def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; +def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">; +def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">; +def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">; +def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; +def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">; +def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">; +def HasCLDEMOTE : Predicate<"Subtarget->hasCLDEMOTE()">; +def HasMOVDIRI : Predicate<"Subtarget->hasMOVDIRI()">; +def HasMOVDIR64B : Predicate<"Subtarget->hasMOVDIR64B()">; +def HasPTWRITE : Predicate<"Subtarget->hasPTWRITE()">; +def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; +def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; +def HasMPX : Predicate<"Subtarget->hasMPX()">; +def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">; +def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">; +def HasCLWB : Predicate<"Subtarget->hasCLWB()">; +def HasWBNOINVD : Predicate<"Subtarget->hasWBNOINVD()">; +def HasRDPID : Predicate<"Subtarget->hasRDPID()">; +def HasWAITPKG : Predicate<"Subtarget->hasWAITPKG()">; +def HasINVPCID : Predicate<"Subtarget->hasINVPCID()">; +def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; +def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">; +def Not64BitMode : 
Predicate<"!Subtarget->is64Bit()">, + AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">; +def In64BitMode : Predicate<"Subtarget->is64Bit()">, + AssemblerPredicate<"Mode64Bit", "64-bit mode">; +def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">; +def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">; +def In16BitMode : Predicate<"Subtarget->is16Bit()">, + AssemblerPredicate<"Mode16Bit", "16-bit mode">; +def Not16BitMode : Predicate<"!Subtarget->is16Bit()">, + AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">; +def In32BitMode : Predicate<"Subtarget->is32Bit()">, + AssemblerPredicate<"Mode32Bit", "32-bit mode">; +def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; +def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; +def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" + "Subtarget->getFrameLowering()->hasFP(*MF)"> { + let RecomputePerFunction = 1; +} +def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; +def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; +def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; +def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; +def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; +def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">; +def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" + "TM.getCodeModel() == CodeModel::Kernel">; +def IsNotPIC : Predicate<"!TM.isPositionIndependent()">; + +// We could compute these on a per-module basis but doing so requires accessing +// the Function object through the <Target>Subtarget and objections were raised +// to that (see post-commit review comments for r301750). +let RecomputePerFunction = 1 in { + def OptForSize : Predicate<"MF->getFunction().optForSize()">; + def OptForMinSize : Predicate<"MF->getFunction().optForMinSize()">; + def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">; + def UseIncDec : Predicate<"!Subtarget->slowIncDec() || " + "MF->getFunction().optForSize()">; + def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().optForSize() || " + "!Subtarget->hasSSE41()">; +} + +def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">; +def FavorMemIndirectCall : Predicate<"!Subtarget->slowTwoMemOps()">; +def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; +def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; +def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; +def HasERMSB : Predicate<"Subtarget->hasERMSB()">; +def HasMFence : Predicate<"Subtarget->hasMFence()">; +def UseRetpoline : Predicate<"Subtarget->useRetpoline()">; +def NotUseRetpoline : Predicate<"!Subtarget->useRetpoline()">; + +//===----------------------------------------------------------------------===// +// X86 Instruction Format Definitions. +// + +include "X86InstrFormats.td" + +//===----------------------------------------------------------------------===// +// Pattern fragments. +// + +// X86 specific condition code. These correspond to CondCode in +// X86InstrInfo.h. They must be kept in synch. +def X86_COND_A : PatLeaf<(i8 0)>; // alt. COND_NBE +def X86_COND_AE : PatLeaf<(i8 1)>; // alt. COND_NC +def X86_COND_B : PatLeaf<(i8 2)>; // alt. COND_C +def X86_COND_BE : PatLeaf<(i8 3)>; // alt. COND_NA +def X86_COND_E : PatLeaf<(i8 4)>; // alt. COND_Z +def X86_COND_G : PatLeaf<(i8 5)>; // alt. COND_NLE +def X86_COND_GE : PatLeaf<(i8 6)>; // alt. COND_NL +def X86_COND_L : PatLeaf<(i8 7)>; // alt. COND_NGE +def X86_COND_LE : PatLeaf<(i8 8)>; // alt. 
COND_NG +def X86_COND_NE : PatLeaf<(i8 9)>; // alt. COND_NZ +def X86_COND_NO : PatLeaf<(i8 10)>; +def X86_COND_NP : PatLeaf<(i8 11)>; // alt. COND_PO +def X86_COND_NS : PatLeaf<(i8 12)>; +def X86_COND_O : PatLeaf<(i8 13)>; +def X86_COND_P : PatLeaf<(i8 14)>; // alt. COND_PE +def X86_COND_S : PatLeaf<(i8 15)>; + +def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>; +def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; +def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>; +def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>; + +// FIXME: Ideally we would just replace the above i*immSExt* matchers with +// relocImm-based matchers, but then FastISel would be unable to use them. +def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{ + return isSExtRelocImm<8>(N); +}]>; +def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{ + return isSExtRelocImm<32>(N); +}]>; + +// If we have multiple users of an immediate, it's much smaller to reuse +// the register, rather than encode the immediate in every instruction. +// This has the risk of increasing register pressure from stretched live +// ranges, however, the immediates should be trivial to rematerialize by +// the RA in the event of high register pressure. +// TODO : This is currently enabled for stores and binary ops. There are more +// cases for which this can be enabled, though this catches the bulk of the +// issues. +// TODO2 : This should really also be enabled under O2, but there's currently +// an issue with RA where we don't pull the constants into their users +// when we rematerialize them. I'll follow-up on enabling O2 after we fix that +// issue. +// TODO3 : This is currently limited to single basic blocks (DAG creation +// pulls block immediates to the top and merges them if necessary). +// Eventually, it would be nice to allow ConstantHoisting to merge constants +// globally for potentially added savings. +// +def imm8_su : PatLeaf<(i8 relocImm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm16_su : PatLeaf<(i16 relocImm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm32_su : PatLeaf<(i32 relocImm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i64immSExt32_su : PatLeaf<(i64immSExt32), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + +def i16immSExt8_su : PatLeaf<(i16immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i32immSExt8_su : PatLeaf<(i32immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i64immSExt8_su : PatLeaf<(i64immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + +def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + +// i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit +// unsigned field. +def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>; + +def i64immZExt32SExt8 : ImmLeaf<i64, [{ + return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm)); +}]>; + +// Helper fragments for loads. + +// It's safe to fold a zextload/extload from i1 as a regular i8 load. The +// upper bits are guaranteed to be zero and we were going to emit a MOV8rm +// which might get folded during peephole anyway. 
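The load fragments that follow distinguish plain loads from any-, zero- and sign-extending loads, and the comment above concerns when an extending i1 load can be treated as an ordinary byte load. As a standalone reminder in plain C++ (not LLVM code) of how zero- and sign-extension of a loaded byte differ:

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t Byte = 0x80;                        // byte value a MOV8rm-style load reads

  uint32_t Zext = Byte;                       // like movzbl: upper bits cleared
  int32_t  Sext = static_cast<int8_t>(Byte);  // like movsbl: sign bit replicated

  std::printf("zext(0x80) = 0x%08x\n", Zext);            // 0x00000080
  std::printf("sext(0x80) = 0x%08x\n", (uint32_t)Sext);  // 0xffffff80
  return 0;
}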
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (unindexedload node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + return ExtType == ISD::NON_EXTLOAD || ExtType == ISD::EXTLOAD || + ExtType == ISD::ZEXTLOAD; +}]>; + +// It's always safe to treat a anyext i16 load as a i32 load if the i16 is +// known to be 32-bit aligned or better. Ditto for i8 to i16. +def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) + return true; + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 2 && !LD->isVolatile(); + return false; +}]>; + +def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ + LoadSDNode *LD = cast<LoadSDNode>(N); + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) + return true; + if (ExtType == ISD::EXTLOAD) + return LD->getAlignment() >= 4 && !LD->isVolatile(); + return false; +}]>; + +def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; +def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; +def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>; +def alignedloadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr)), [{ + LoadSDNode *Ld = cast<LoadSDNode>(N); + return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize(); +}]>; +def memopf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr)), [{ + LoadSDNode *Ld = cast<LoadSDNode>(N); + return Subtarget->hasSSEUnalignedMem() || + Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize(); +}]>; + +def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; +def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; +def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>; +def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>; +def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>; +def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>; + +def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>; +def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>; +def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>; +def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>; +def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>; +def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>; +def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>; +def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>; +def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>; +def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>; + +def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>; +def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>; +def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>; +def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>; +def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>; +def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>; +def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 
(extloadi1 node:$ptr))>; +def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>; +def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>; +def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>; + + +// An 'and' node with a single use. +def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; +// An 'srl' node with a single use. +def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{ + return N->hasOneUse(); +}]>; +// An 'trunc' node with a single use. +def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{ + return N->hasOneUse(); +}]>; + +//===----------------------------------------------------------------------===// +// Instruction list. +// + +// Nop +let hasSideEffects = 0, SchedRW = [WriteNop] in { + def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>; + def NOOPW : I<0x1f, MRMXm, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16, NotMemoryFoldable; + def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32, NotMemoryFoldable; + def NOOPQ : RI<0x1f, MRMXm, (outs), (ins i64mem:$zero), + "nop{q}\t$zero", []>, TB, NotMemoryFoldable, + Requires<[In64BitMode]>; + // Also allow register so we can assemble/disassemble + def NOOPWr : I<0x1f, MRMXr, (outs), (ins GR16:$zero), + "nop{w}\t$zero", []>, TB, OpSize16, NotMemoryFoldable; + def NOOPLr : I<0x1f, MRMXr, (outs), (ins GR32:$zero), + "nop{l}\t$zero", []>, TB, OpSize32, NotMemoryFoldable; + def NOOPQr : RI<0x1f, MRMXr, (outs), (ins GR64:$zero), + "nop{q}\t$zero", []>, TB, NotMemoryFoldable, + Requires<[In64BitMode]>; + def NOOPW_19 : I<0x19, MRMXm, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOPL_19 : I<0x19, MRMXm, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + //def NOOPW_1a : I<0x1a, MRMXm, (outs), (ins i16mem:$zero), + // "nop{w}\t$zero", []>, TB, OpSize16; + //def NOOPL_1a : I<0x1a, MRMXm, (outs), (ins i32mem:$zero), + // "nop{l}\t$zero", []>, TB, OpSize32; + //def NOOPW_1b : I<0x1b, MRMXm, (outs), (ins i16mem:$zero), + // "nop{w}\t$zero", []>, TB, OpSize16; + //def NOOPL_1b : I<0x1b, MRMXm, (outs), (ins i32mem:$zero), + // "nop{l}\t$zero", []>, TB, OpSize32; + def NOOPW_1c : I<0x1c, MRMXm, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + //def NOOPL_1c : I<0x1c, MRMXm, (outs), (ins i32mem:$zero), + // "nop{l}\t$zero", []>, TB, OpSize32; + def NOOPW_1d : I<0x1d, MRMXm, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOPL_1d : I<0x1d, MRMXm, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + def NOOPW_1e : I<0x1e, MRMXm, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOPL_1e : I<0x1e, MRMXm, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16m4 : I<0x18, MRM4m, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_m4 : I<0x18, MRM4m, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16r4 : I<0x18, MRM4r, (outs), (ins GR16:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_r4 : I<0x18, MRM4r, (outs), (ins GR32:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16m5 : I<0x18, MRM5m, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_m5 : I<0x18, MRM5m, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16r5 : I<0x18, MRM5r, (outs), (ins GR16:$zero), 
+ "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_r5 : I<0x18, MRM5r, (outs), (ins GR32:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16m6 : I<0x18, MRM6m, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_m6 : I<0x18, MRM6m, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16r6 : I<0x18, MRM6r, (outs), (ins GR16:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_r6 : I<0x18, MRM6r, (outs), (ins GR32:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16m7 : I<0x18, MRM7m, (outs), (ins i16mem:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_m7 : I<0x18, MRM7m, (outs), (ins i32mem:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; + + def NOOP18_16r7 : I<0x18, MRM7r, (outs), (ins GR16:$zero), + "nop{w}\t$zero", []>, TB, OpSize16; + def NOOP18_r7 : I<0x18, MRM7r, (outs), (ins GR32:$zero), + "nop{l}\t$zero", []>, TB, OpSize32; +} + + +// Constructing a stack frame. +def ENTER : Ii16<0xC8, RawFrmImm8, (outs), (ins i16imm:$len, i8imm:$lvl), + "enter\t$len, $lvl", []>, Sched<[WriteMicrocoded]>; + +let SchedRW = [WriteALU] in { +let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, hasSideEffects=0 in +def LEAVE : I<0xC9, RawFrm, (outs), (ins), "leave", []>, + Requires<[Not64BitMode]>; + +let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, hasSideEffects = 0 in +def LEAVE64 : I<0xC9, RawFrm, (outs), (ins), "leave", []>, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +// + +let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1, + SchedRW = [WriteSystem] in + def Int_eh_sjlj_setup_dispatch + : PseudoI<(outs), (ins), [(X86eh_sjlj_setup_dispatch)]>; + +let Defs = [ESP], Uses = [ESP], hasSideEffects=0 in { +let mayLoad = 1, SchedRW = [WriteLoad] in { +def POP16r : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>, + OpSize16; +def POP32r : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>, + OpSize32, Requires<[Not64BitMode]>; +// Long form for the disassembler. +let isCodeGenOnly = 1, ForceDisassemble = 1 in { +def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>, + OpSize16, NotMemoryFoldable; +def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>, + OpSize32, Requires<[Not64BitMode]>, NotMemoryFoldable; +} // isCodeGenOnly = 1, ForceDisassemble = 1 +} // mayLoad, SchedRW +let mayStore = 1, mayLoad = 1, SchedRW = [WriteRMW] in { +def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", []>, + OpSize16; +def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", []>, + OpSize32, Requires<[Not64BitMode]>; +} // mayStore, mayLoad, WriteRMW + +let mayStore = 1, SchedRW = [WriteStore] in { +def PUSH16r : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>, + OpSize16; +def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>, + OpSize32, Requires<[Not64BitMode]>; +// Long form for the disassembler. 
+let isCodeGenOnly = 1, ForceDisassemble = 1 in { +def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>, + OpSize16, NotMemoryFoldable; +def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>, + OpSize32, Requires<[Not64BitMode]>, NotMemoryFoldable; +} // isCodeGenOnly = 1, ForceDisassemble = 1 + +def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm), + "push{w}\t$imm", []>, OpSize16; +def PUSHi16 : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), + "push{w}\t$imm", []>, OpSize16; + +def PUSH32i8 : Ii8<0x6a, RawFrm, (outs), (ins i32i8imm:$imm), + "push{l}\t$imm", []>, OpSize32, + Requires<[Not64BitMode]>; +def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), + "push{l}\t$imm", []>, OpSize32, + Requires<[Not64BitMode]>; +} // mayStore, SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { +def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src", []>, + OpSize16; +def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src", []>, + OpSize32, Requires<[Not64BitMode]>; +} // mayLoad, mayStore, SchedRW + +} + +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW], Defs = [ESP] in { + let Uses = [ESP] in + def RDFLAGS32 : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_flags_read_u32))]>, + Requires<[Not64BitMode]>; + + let Uses = [RSP] in + def RDFLAGS64 : PseudoI<(outs GR64:$dst), (ins), + [(set GR64:$dst, (int_x86_flags_read_u64))]>, + Requires<[In64BitMode]>; +} + +let mayLoad = 1, mayStore = 1, usesCustomInserter = 1, + SchedRW = [WriteRMW] in { + let Defs = [ESP, EFLAGS, DF], Uses = [ESP] in + def WRFLAGS32 : PseudoI<(outs), (ins GR32:$src), + [(int_x86_flags_write_u32 GR32:$src)]>, + Requires<[Not64BitMode]>; + + let Defs = [RSP, EFLAGS, DF], Uses = [RSP] in + def WRFLAGS64 : PseudoI<(outs), (ins GR64:$src), + [(int_x86_flags_write_u64 GR64:$src)]>, + Requires<[In64BitMode]>; +} + +let Defs = [ESP, EFLAGS, DF], Uses = [ESP], mayLoad = 1, hasSideEffects=0, + SchedRW = [WriteLoad] in { +def POPF16 : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize16; +def POPF32 : I<0x9D, RawFrm, (outs), (ins), "popf{l|d}", []>, OpSize32, + Requires<[Not64BitMode]>; +} + +let Defs = [ESP], Uses = [ESP, EFLAGS, DF], mayStore = 1, hasSideEffects=0, + SchedRW = [WriteStore] in { +def PUSHF16 : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize16; +def PUSHF32 : I<0x9C, RawFrm, (outs), (ins), "pushf{l|d}", []>, OpSize32, + Requires<[Not64BitMode]>; +} + +let Defs = [RSP], Uses = [RSP], hasSideEffects=0 in { +let mayLoad = 1, SchedRW = [WriteLoad] in { +def POP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>, + OpSize32, Requires<[In64BitMode]>; +// Long form for the disassembler. +let isCodeGenOnly = 1, ForceDisassemble = 1 in { +def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>, + OpSize32, Requires<[In64BitMode]>, NotMemoryFoldable; +} // isCodeGenOnly = 1, ForceDisassemble = 1 +} // mayLoad, SchedRW +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in +def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", []>, + OpSize32, Requires<[In64BitMode]>; +let mayStore = 1, SchedRW = [WriteStore] in { +def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", []>, + OpSize32, Requires<[In64BitMode]>; +// Long form for the disassembler. 
+let isCodeGenOnly = 1, ForceDisassemble = 1 in { +def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>, + OpSize32, Requires<[In64BitMode]>, NotMemoryFoldable; +} // isCodeGenOnly = 1, ForceDisassemble = 1 +} // mayStore, SchedRW +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { +def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>, + OpSize32, Requires<[In64BitMode]>; +} // mayLoad, mayStore, SchedRW +} + +let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, + SchedRW = [WriteStore] in { +def PUSH64i8 : Ii8<0x6a, RawFrm, (outs), (ins i64i8imm:$imm), + "push{q}\t$imm", []>, OpSize32, + Requires<[In64BitMode]>; +def PUSH64i32 : Ii32S<0x68, RawFrm, (outs), (ins i64i32imm:$imm), + "push{q}\t$imm", []>, OpSize32, + Requires<[In64BitMode]>; +} + +let Defs = [RSP, EFLAGS, DF], Uses = [RSP], mayLoad = 1, hasSideEffects=0 in +def POPF64 : I<0x9D, RawFrm, (outs), (ins), "popfq", []>, + OpSize32, Requires<[In64BitMode]>, Sched<[WriteLoad]>; +let Defs = [RSP], Uses = [RSP, EFLAGS, DF], mayStore = 1, hasSideEffects=0 in +def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", []>, + OpSize32, Requires<[In64BitMode]>, Sched<[WriteStore]>; + +let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP], + mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteLoad] in { +def POPA32 : I<0x61, RawFrm, (outs), (ins), "popal", []>, + OpSize32, Requires<[Not64BitMode]>; +def POPA16 : I<0x61, RawFrm, (outs), (ins), "popaw", []>, + OpSize16, Requires<[Not64BitMode]>; +} +let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], + mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in { +def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pushal", []>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHA16 : I<0x60, RawFrm, (outs), (ins), "pushaw", []>, + OpSize16, Requires<[Not64BitMode]>; +} + +let Constraints = "$src = $dst", SchedRW = [WriteBSWAP32] in { +// This instruction is a consequence of BSWAP32r observing operand size. The +// encoding is valid, but the behavior is undefined. +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in +def BSWAP16r_BAD : I<0xC8, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), + "bswap{w}\t$dst", []>, OpSize16, TB; +// GR32 = bswap GR32 +def BSWAP32r : I<0xC8, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), + "bswap{l}\t$dst", + [(set GR32:$dst, (bswap GR32:$src))]>, OpSize32, TB; + +let SchedRW = [WriteBSWAP64] in +def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), + "bswap{q}\t$dst", + [(set GR64:$dst, (bswap GR64:$src))]>, TB; +} // Constraints = "$src = $dst", SchedRW + +// Bit scan instructions. 
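+// For reference, an illustrative C sketch (not part of the target description):
+// bsf yields the index of the lowest set bit and bsr the index of the highest
+// set bit of a non-zero source; a zero source sets ZF and leaves the
+// destination undefined, which is why the patterns below also write EFLAGS.
+//
+//   #include <stdint.h>
+//   static inline unsigned bsf32(uint32_t x) {   // assumes x != 0
+//     unsigned i = 0;
+//     while (((x >> i) & 1u) == 0) ++i;
+//     return i;
+//   }
+//   static inline unsigned bsr32(uint32_t x) {   // assumes x != 0
+//     unsigned i = 31;
+//     while (((x >> i) & 1u) == 0) --i;
+//     return i;
+//   }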
+let Defs = [EFLAGS] in { +def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "bsf{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, + PS, OpSize16, Sched<[WriteBSF]>; +def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bsf{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, + PS, OpSize16, Sched<[WriteBSFLd]>; +def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "bsf{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, + PS, OpSize32, Sched<[WriteBSF]>; +def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bsf{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, + PS, OpSize32, Sched<[WriteBSFLd]>; +def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "bsf{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, + PS, Sched<[WriteBSF]>; +def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "bsf{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, + PS, Sched<[WriteBSFLd]>; + +def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "bsr{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, + PS, OpSize16, Sched<[WriteBSR]>; +def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "bsr{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, + PS, OpSize16, Sched<[WriteBSRLd]>; +def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "bsr{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, + PS, OpSize32, Sched<[WriteBSR]>; +def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "bsr{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, + PS, OpSize32, Sched<[WriteBSRLd]>; +def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "bsr{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, + PS, Sched<[WriteBSR]>; +def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "bsr{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, + PS, Sched<[WriteBSRLd]>; +} // Defs = [EFLAGS] + +let SchedRW = [WriteMicrocoded] in { +let Defs = [EDI,ESI], Uses = [EDI,ESI,DF] in { +def MOVSB : I<0xA4, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src), + "movsb\t{$src, $dst|$dst, $src}", []>; +def MOVSW : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src), + "movsw\t{$src, $dst|$dst, $src}", []>, OpSize16; +def MOVSL : I<0xA5, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src), + "movs{l|d}\t{$src, $dst|$dst, $src}", []>, OpSize32; +def MOVSQ : RI<0xA5, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src), + "movsq\t{$src, $dst|$dst, $src}", []>, + Requires<[In64BitMode]>; +} + +let Defs = [EDI], Uses = [AL,EDI,DF] in +def STOSB : I<0xAA, RawFrmDst, (outs), (ins dstidx8:$dst), + "stosb\t{%al, $dst|$dst, al}", []>; +let Defs = [EDI], Uses = [AX,EDI,DF] in +def STOSW : I<0xAB, RawFrmDst, (outs), (ins dstidx16:$dst), + "stosw\t{%ax, $dst|$dst, ax}", []>, OpSize16; +let Defs = [EDI], Uses = [EAX,EDI,DF] in +def STOSL : I<0xAB, RawFrmDst, (outs), (ins dstidx32:$dst), + "stos{l|d}\t{%eax, $dst|$dst, eax}", []>, OpSize32; +let Defs = [RDI], Uses = [RAX,RDI,DF] in +def STOSQ : RI<0xAB, RawFrmDst, (outs), (ins 
dstidx64:$dst), + "stosq\t{%rax, $dst|$dst, rax}", []>, + Requires<[In64BitMode]>; + +let Defs = [EDI,EFLAGS], Uses = [AL,EDI,DF] in +def SCASB : I<0xAE, RawFrmDst, (outs), (ins dstidx8:$dst), + "scasb\t{$dst, %al|al, $dst}", []>; +let Defs = [EDI,EFLAGS], Uses = [AX,EDI,DF] in +def SCASW : I<0xAF, RawFrmDst, (outs), (ins dstidx16:$dst), + "scasw\t{$dst, %ax|ax, $dst}", []>, OpSize16; +let Defs = [EDI,EFLAGS], Uses = [EAX,EDI,DF] in +def SCASL : I<0xAF, RawFrmDst, (outs), (ins dstidx32:$dst), + "scas{l|d}\t{$dst, %eax|eax, $dst}", []>, OpSize32; +let Defs = [EDI,EFLAGS], Uses = [RAX,EDI,DF] in +def SCASQ : RI<0xAF, RawFrmDst, (outs), (ins dstidx64:$dst), + "scasq\t{$dst, %rax|rax, $dst}", []>, + Requires<[In64BitMode]>; + +let Defs = [EDI,ESI,EFLAGS], Uses = [EDI,ESI,DF] in { +def CMPSB : I<0xA6, RawFrmDstSrc, (outs), (ins dstidx8:$dst, srcidx8:$src), + "cmpsb\t{$dst, $src|$src, $dst}", []>; +def CMPSW : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx16:$dst, srcidx16:$src), + "cmpsw\t{$dst, $src|$src, $dst}", []>, OpSize16; +def CMPSL : I<0xA7, RawFrmDstSrc, (outs), (ins dstidx32:$dst, srcidx32:$src), + "cmps{l|d}\t{$dst, $src|$src, $dst}", []>, OpSize32; +def CMPSQ : RI<0xA7, RawFrmDstSrc, (outs), (ins dstidx64:$dst, srcidx64:$src), + "cmpsq\t{$dst, $src|$src, $dst}", []>, + Requires<[In64BitMode]>; +} +} // SchedRW + +//===----------------------------------------------------------------------===// +// Move Instructions. +// +let SchedRW = [WriteMove] in { +let hasSideEffects = 0, isMoveReg = 1 in { +def MOV8rr : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>; +def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16; +def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32; +def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>; +} + +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { +def MOV8ri : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, imm:$src)]>; +def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, imm:$src)]>, OpSize16; +def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, relocImm:$src)]>, OpSize32; +def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, i64immSExt32:$src)]>; +} +let isReMaterializable = 1 in { +def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), + "movabs{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, relocImm:$src)]>; +} + +// Longer forms that use a ModR/M byte. 
Needed for disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>, + FoldGenData<"MOV8ri">; +def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16, + FoldGenData<"MOV16ri">; +def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32, + FoldGenData<"MOV32ri">; +} +} // SchedRW + +let SchedRW = [WriteStore] in { +def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(store (i8 imm8_su:$src), addr:$dst)]>; +def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(store (i16 imm16_su:$src), addr:$dst)]>, OpSize16; +def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(store (i32 imm32_su:$src), addr:$dst)]>, OpSize32; +def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(store i64immSExt32_su:$src, addr:$dst)]>, + Requires<[In64BitMode]>; +} // SchedRW + +let hasSideEffects = 0 in { + +/// Memory offset versions of moves. The immediate is an address mode sized +/// offset from the segment base. +let SchedRW = [WriteALU] in { +let mayLoad = 1 in { +let Defs = [AL] in +def MOV8ao32 : Ii32<0xA0, RawFrmMemOffs, (outs), (ins offset32_8:$src), + "mov{b}\t{$src, %al|al, $src}", []>, + AdSize32; +let Defs = [AX] in +def MOV16ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_16:$src), + "mov{w}\t{$src, %ax|ax, $src}", []>, + OpSize16, AdSize32; +let Defs = [EAX] in +def MOV32ao32 : Ii32<0xA1, RawFrmMemOffs, (outs), (ins offset32_32:$src), + "mov{l}\t{$src, %eax|eax, $src}", []>, + OpSize32, AdSize32; +let Defs = [RAX] in +def MOV64ao32 : RIi32<0xA1, RawFrmMemOffs, (outs), (ins offset32_64:$src), + "mov{q}\t{$src, %rax|rax, $src}", []>, + AdSize32; + +let Defs = [AL] in +def MOV8ao16 : Ii16<0xA0, RawFrmMemOffs, (outs), (ins offset16_8:$src), + "mov{b}\t{$src, %al|al, $src}", []>, AdSize16; +let Defs = [AX] in +def MOV16ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_16:$src), + "mov{w}\t{$src, %ax|ax, $src}", []>, + OpSize16, AdSize16; +let Defs = [EAX] in +def MOV32ao16 : Ii16<0xA1, RawFrmMemOffs, (outs), (ins offset16_32:$src), + "mov{l}\t{$src, %eax|eax, $src}", []>, + AdSize16, OpSize32; +} // mayLoad +let mayStore = 1 in { +let Uses = [AL] in +def MOV8o32a : Ii32<0xA2, RawFrmMemOffs, (outs), (ins offset32_8:$dst), + "mov{b}\t{%al, $dst|$dst, al}", []>, AdSize32; +let Uses = [AX] in +def MOV16o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_16:$dst), + "mov{w}\t{%ax, $dst|$dst, ax}", []>, + OpSize16, AdSize32; +let Uses = [EAX] in +def MOV32o32a : Ii32<0xA3, RawFrmMemOffs, (outs), (ins offset32_32:$dst), + "mov{l}\t{%eax, $dst|$dst, eax}", []>, + OpSize32, AdSize32; +let Uses = [RAX] in +def MOV64o32a : RIi32<0xA3, RawFrmMemOffs, (outs), (ins offset32_64:$dst), + "mov{q}\t{%rax, $dst|$dst, rax}", []>, + AdSize32; + +let Uses = [AL] in +def MOV8o16a : Ii16<0xA2, RawFrmMemOffs, (outs), (ins offset16_8:$dst), + "mov{b}\t{%al, $dst|$dst, al}", []>, AdSize16; +let Uses = [AX] in +def MOV16o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_16:$dst), + "mov{w}\t{%ax, $dst|$dst, ax}", []>, + OpSize16, AdSize16; +let Uses = [EAX] in +def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, 
(outs), (ins offset16_32:$dst), + "mov{l}\t{%eax, $dst|$dst, eax}", []>, + OpSize32, AdSize16; +} // mayStore + +// These forms all have full 64-bit absolute addresses in their instructions +// and use the movabs mnemonic to indicate this specific form. +let mayLoad = 1 in { +let Defs = [AL] in +def MOV8ao64 : Ii64<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src), + "movabs{b}\t{$src, %al|al, $src}", []>, + AdSize64; +let Defs = [AX] in +def MOV16ao64 : Ii64<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src), + "movabs{w}\t{$src, %ax|ax, $src}", []>, + OpSize16, AdSize64; +let Defs = [EAX] in +def MOV32ao64 : Ii64<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src), + "movabs{l}\t{$src, %eax|eax, $src}", []>, + OpSize32, AdSize64; +let Defs = [RAX] in +def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src), + "movabs{q}\t{$src, %rax|rax, $src}", []>, + AdSize64; +} // mayLoad + +let mayStore = 1 in { +let Uses = [AL] in +def MOV8o64a : Ii64<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst), + "movabs{b}\t{%al, $dst|$dst, al}", []>, + AdSize64; +let Uses = [AX] in +def MOV16o64a : Ii64<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst), + "movabs{w}\t{%ax, $dst|$dst, ax}", []>, + OpSize16, AdSize64; +let Uses = [EAX] in +def MOV32o64a : Ii64<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst), + "movabs{l}\t{%eax, $dst|$dst, eax}", []>, + OpSize32, AdSize64; +let Uses = [RAX] in +def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst), + "movabs{q}\t{%rax, $dst|$dst, rax}", []>, + AdSize64; +} // mayStore +} // SchedRW +} // hasSideEffects = 0 + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [WriteMove], isMoveReg = 1 in { +def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>, + FoldGenData<"MOV8rr">; +def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16, + FoldGenData<"MOV16rr">; +def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32, + FoldGenData<"MOV32rr">; +def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>, + FoldGenData<"MOV64rr">; +} + +// Reversed version with ".s" suffix for GAS compatibility. 
+//def : InstAlias<"mov{b}.s\t{$src, $dst|$dst, $src}", +// (MOV8rr_REV GR8:$dst, GR8:$src), 0>; +//def : InstAlias<"mov{w}.s\t{$src, $dst|$dst, $src}", +// (MOV16rr_REV GR16:$dst, GR16:$src), 0>; +//def : InstAlias<"mov{l}.s\t{$src, $dst|$dst, $src}", +// (MOV32rr_REV GR32:$dst, GR32:$src), 0>; +//def : InstAlias<"mov{q}.s\t{$src, $dst|$dst, $src}", +// (MOV64rr_REV GR64:$dst, GR64:$src), 0>; +//def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}", +// (MOV8rr_REV GR8:$dst, GR8:$src), 0, "att">; +//def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}", +// (MOV16rr_REV GR16:$dst, GR16:$src), 0, "att">; +//def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}", +// (MOV32rr_REV GR32:$dst, GR32:$src), 0, "att">; +//def : InstAlias<"mov.s\t{$src, $dst|$dst, $src}", +// (MOV64rr_REV GR64:$dst, GR64:$src), 0, "att">; + +let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { +def MOV8rm : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(set GR8:$dst, (loadi8 addr:$src))]>; +def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize16; +def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (loadi32 addr:$src))]>, OpSize32; +def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (load addr:$src))]>; +} + +let SchedRW = [WriteStore] in { +def MOV8mr : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src), + "mov{b}\t{$src, $dst|$dst, $src}", + [(store GR8:$src, addr:$dst)]>; +def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", + [(store GR16:$src, addr:$dst)]>, OpSize16; +def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", + [(store GR32:$src, addr:$dst)]>, OpSize32; +def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", + [(store GR64:$src, addr:$dst)]>; +} // SchedRW + +// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so +// that they can be used for copying and storing h registers, which can't be +// encoded when a REX prefix is present. +let isCodeGenOnly = 1 in { +let hasSideEffects = 0, isMoveReg = 1 in +def MOV8rr_NOREX : I<0x88, MRMDestReg, + (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteMove]>; +let mayStore = 1, hasSideEffects = 0 in +def MOV8mr_NOREX : I<0x88, MRMDestMem, + (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteStore]>; +let mayLoad = 1, hasSideEffects = 0, + canFoldAsLoad = 1, isReMaterializable = 1 in +def MOV8rm_NOREX : I<0x8A, MRMSrcMem, + (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src), + "mov{b}\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteLoad]>; +} + + +// Condition code ops, incl. set if equal/not equal/... 
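+// lahf/sahf (defined next) move the low byte of EFLAGS through AH using the
+// fixed layout SF:ZF:0:AF:0:PF:1:CF (bit 7 down to bit 0). A minimal C sketch
+// of that packing, for illustration only:
+//
+//   #include <stdint.h>
+//   static inline uint8_t pack_lahf(int sf, int zf, int af, int pf, int cf) {
+//     return (uint8_t)((sf << 7) | (zf << 6) | (af << 4) |
+//                      (pf << 2) | (1 << 1) | (cf << 0));
+//   }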
+let SchedRW = [WriteLAHFSAHF] in { +let Defs = [EFLAGS], Uses = [AH] in +def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", + [(set EFLAGS, (X86sahf AH))]>, + Requires<[HasLAHFSAHF]>; +let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in +def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>, // AH = flags + Requires<[HasLAHFSAHF]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Bit tests instructions: BT, BTS, BTR, BTC. + +let Defs = [EFLAGS] in { +let SchedRW = [WriteBitTest] in { +def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))]>, + OpSize16, TB, NotMemoryFoldable; +def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))]>, + OpSize32, TB, NotMemoryFoldable; +def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))]>, TB, + NotMemoryFoldable; +} // SchedRW + +// Unlike with the register+register form, the memory+register form of the +// bt instruction does not ignore the high bits of the index. From ISel's +// perspective, this is pretty bizarre. Make these instructions disassembly +// only for now. These instructions are also slow on modern CPUs so that's +// another reason to avoid generating them. + +let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in { + def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + []>, OpSize16, TB, NotMemoryFoldable; + def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + []>, OpSize32, TB, NotMemoryFoldable; + def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + []>, TB, NotMemoryFoldable; +} + +let SchedRW = [WriteBitTest] in { +def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR16:$src1, i16immSExt8:$src2))]>, + OpSize16, TB; +def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR32:$src1, i32immSExt8:$src2))]>, + OpSize32, TB; +def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt GR64:$src1, i64immSExt8:$src2))]>, TB; +} // SchedRW + +// Note that these instructions aren't slow because that only applies when the +// other operand is in a register. When it's an immediate, bt is still fast. 
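+// As noted above, the memory+register bt forms use the full register value as
+// a bit offset from the memory operand's address, so they can read bytes well
+// outside the nominal i16/i32/i64 operand. A rough C model of that addressing
+// (illustration only; flat little-endian memory, non-negative offsets shown,
+// though architecturally the offset is signed):
+//
+//   #include <stdint.h>
+//   // Bit that "bt{w,l,q} %reg, (base)" copies into CF; the index is used in
+//   // full, it is not masked to the operand width.
+//   static inline int bt_mem(const uint8_t *base, uint64_t bitidx) {
+//     return (base[bitidx >> 3] >> (bitidx & 7)) & 1;
+//   }
+//
+// The imm8 forms below take a small constant offset, which matches the note
+// above that the immediate form stays fast.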
+let SchedRW = [WriteALU] in { +def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "bt{w}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi16 addr:$src1), + i16immSExt8:$src2))]>, + OpSize16, TB; +def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "bt{l}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi32 addr:$src1), + i32immSExt8:$src2))]>, + OpSize32, TB; +def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "bt{q}\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86bt (loadi64 addr:$src1), + i64immSExt8:$src2))]>, TB, + Requires<[In64BitMode]>; +} // SchedRW + +let hasSideEffects = 0 in { +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { +def BTC16rr : I<0xBB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB, NotMemoryFoldable; +def BTC32rr : I<0xBB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB, NotMemoryFoldable; +def BTC64rr : RI<0xBB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + NotMemoryFoldable; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB, NotMemoryFoldable; +def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB, NotMemoryFoldable; +def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + NotMemoryFoldable; +} + +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { +def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; +def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; +def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; +def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "btc{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; +def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + Requires<[In64BitMode]>; +} + +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { +def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB, NotMemoryFoldable; +def BTR32rr : I<0xB3, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB, NotMemoryFoldable; +def BTR64rr : RI<0xB3, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + NotMemoryFoldable; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "btr{w}\t{$src2, $src1|$src1, 
$src2}", []>, + OpSize16, TB, NotMemoryFoldable; +def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB, NotMemoryFoldable; +def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + NotMemoryFoldable; +} + +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { +def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB; +def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB; +def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "btr{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB; +def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "btr{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB; +def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + Requires<[In64BitMode]>; +} + +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { +def BTS16rr : I<0xAB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB, NotMemoryFoldable; +def BTS32rr : I<0xAB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB, NotMemoryFoldable; +def BTS64rr : RI<0xAB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + NotMemoryFoldable; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", []>, + OpSize16, TB, NotMemoryFoldable; +def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", []>, + OpSize32, TB, NotMemoryFoldable; +def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + NotMemoryFoldable; +} + +let SchedRW = [WriteBitTest], Constraints = "$src1 = $dst" in { +def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; +def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; +def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { +def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2), + "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize16, TB; +def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), + "bts{l}\t{$src2, $src1|$src1, $src2}", []>, OpSize32, TB; +def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB, + Requires<[In64BitMode]>; +} +} // hasSideEffects = 0 +} // Defs = [EFLAGS] + + 
+//===----------------------------------------------------------------------===// +// Atomic support +// + +// Atomic swap. These are just normal xchg instructions. But since a memory +// operand is referenced, the atomicity is ensured. +multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag> { + let Constraints = "$val = $dst", SchedRW = [WriteALULd, WriteRMW] in { + def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"), + [(set + GR8:$dst, + (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))]>; + def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"), + [(set + GR16:$dst, + (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))]>, + OpSize16; + def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"), + [(set + GR32:$dst, + (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))]>, + OpSize32; + def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"), + [(set + GR64:$dst, + (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))]>; + } +} + +defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap">, NotMemoryFoldable; + +// Swap between registers. +let SchedRW = [WriteALU] in { +let Constraints = "$src1 = $dst1, $src2 = $dst2", hasSideEffects = 0 in { +def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst1, GR8:$dst2), + (ins GR8:$src1, GR8:$src2), + "xchg{b}\t{$src1, $src2|$src2, $src1}", []>, NotMemoryFoldable; +def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst1, GR16:$dst2), + (ins GR16:$src1, GR16:$src2), + "xchg{w}\t{$src1, $src2|$src2, $src1}", []>, + OpSize16, NotMemoryFoldable; +def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst1, GR32:$dst2), + (ins GR32:$src1, GR32:$src2), + "xchg{l}\t{$src1, $src2|$src2, $src1}", []>, + OpSize32, NotMemoryFoldable; +def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst1, GR64:$dst2), + (ins GR64:$src1 ,GR64:$src2), + "xchg{q}\t{$src1, $src2|$src2, $src1}", []>, NotMemoryFoldable; +} + +def NOOP19rr: I<0x19, MRMSrcReg, (outs), (ins GR32:$val, GR32:$src), + "nop\t{$val, $src|$src, $val}", []>, TB, + OpSize32; + +// Swap between EAX and other registers. 
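+// Like the register-register forms above, these accumulator forms are plain
+// swaps; only xchg with a memory operand carries the implicit lock that lets
+// ATOMIC_SWAP above map atomic_swap straight onto XCHG*rm. A minimal C11
+// sketch of code that typically lowers to that locked memory form
+// (illustration only):
+//
+//   #include <stdatomic.h>
+//   static inline int swap_value(_Atomic int *p, int v) {
+//     return atomic_exchange(p, v);   // usually a single un-prefixed xchg
+//   }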
+let Constraints = "$src = $dst", hasSideEffects = 0 in { +let Uses = [AX], Defs = [AX] in +def XCHG16ar : I<0x90, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), + "xchg{w}\t{%ax, $src|$src, ax}", []>, OpSize16; +let Uses = [EAX], Defs = [EAX] in +def XCHG32ar : I<0x90, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), + "xchg{l}\t{%eax, $src|$src, eax}", []>, OpSize32; +let Uses = [RAX], Defs = [RAX] in +def XCHG64ar : RI<0x90, AddRegFrm, (outs GR64:$dst), (ins GR64:$src), + "xchg{q}\t{%rax, $src|$src, rax}", []>; +} +} // SchedRW + +let hasSideEffects = 0, Constraints = "$src1 = $dst1, $src2 = $dst2", + Defs = [EFLAGS], SchedRW = [WriteALU] in { +def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst1, GR8:$dst2), + (ins GR8:$src1, GR8:$src2), + "xadd{b}\t{$src2, $src1|$src1, $src2}", []>, TB; +def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst1, GR16:$dst2), + (ins GR16:$src1, GR16:$src2), + "xadd{w}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize16; +def XADD32rr : I<0xC1, MRMDestReg, (outs GR32:$dst1, GR32:$dst2), + (ins GR32:$src1, GR32:$src2), + "xadd{l}\t{$src2, $src1|$src1, $src2}", []>, TB, OpSize32; +def XADD64rr : RI<0xC1, MRMDestReg, (outs GR64:$dst1, GR64:$dst2), + (ins GR64:$src1, GR64:$src2), + "xadd{q}\t{$src2, $src1|$src1, $src2}", []>, TB; +} // SchedRW + +let mayLoad = 1, mayStore = 1, hasSideEffects = 0, Constraints = "$val = $dst", + Defs = [EFLAGS], SchedRW = [WriteALULd, WriteRMW] in { +def XADD8rm : I<0xC0, MRMSrcMem, (outs GR8:$dst), + (ins GR8:$val, i8mem:$ptr), + "xadd{b}\t{$val, $ptr|$ptr, $val}", []>, TB; +def XADD16rm : I<0xC1, MRMSrcMem, (outs GR16:$dst), + (ins GR16:$val, i16mem:$ptr), + "xadd{w}\t{$val, $ptr|$ptr, $val}", []>, TB, + OpSize16; +def XADD32rm : I<0xC1, MRMSrcMem, (outs GR32:$dst), + (ins GR32:$val, i32mem:$ptr), + "xadd{l}\t{$val, $ptr|$ptr, $val}", []>, TB, + OpSize32; +def XADD64rm : RI<0xC1, MRMSrcMem, (outs GR64:$dst), + (ins GR64:$val, i64mem:$ptr), + "xadd{q}\t{$val, $ptr|$ptr, $val}", []>, TB; + +} + +let SchedRW = [WriteALU], hasSideEffects = 0 in { +let Defs = [AL, EFLAGS], Uses = [AL] in +def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src), + "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB, + NotMemoryFoldable; +let Defs = [AX, EFLAGS], Uses = [AX] in +def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16, + NotMemoryFoldable; +let Defs = [EAX, EFLAGS], Uses = [EAX] in +def CMPXCHG32rr : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32, + NotMemoryFoldable; +let Defs = [RAX, EFLAGS], Uses = [RAX] in +def CMPXCHG64rr : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB, + NotMemoryFoldable; +} // SchedRW, hasSideEffects + +let SchedRW = [WriteALULd, WriteRMW], mayLoad = 1, mayStore = 1, + hasSideEffects = 0 in { +let Defs = [AL, EFLAGS], Uses = [AL] in +def CMPXCHG8rm : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src), + "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB, + NotMemoryFoldable; +let Defs = [AX, EFLAGS], Uses = [AX] in +def CMPXCHG16rm : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16, + NotMemoryFoldable; +let Defs = [EAX, EFLAGS], Uses = [EAX] in +def CMPXCHG32rm : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32, + NotMemoryFoldable; +let Defs = [RAX, EFLAGS], Uses = [RAX] in +def 
CMPXCHG64rm : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB, + NotMemoryFoldable; + +let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in +def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst), + "cmpxchg8b\t$dst", []>, TB; + +let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in +def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), + "cmpxchg16b\t$dst", []>, + TB, Requires<[HasCmpxchg16b, In64BitMode]>; +} // SchedRW, mayLoad, mayStore, hasSideEffects + + +// Lock instruction prefix +let SchedRW = [WriteMicrocoded] in +def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>; + +let SchedRW = [WriteNop] in { + +// Rex64 instruction prefix +def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>, + Requires<[In64BitMode]>; + +// Data16 instruction prefix +def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>; +} // SchedRW + +// Repeat string operation instruction prefixes +let Defs = [ECX], Uses = [ECX,DF], SchedRW = [WriteMicrocoded] in { +// Repeat (used with INS, OUTS, MOVS, LODS and STOS) +def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>; +// Repeat while not equal (used with CMPS and SCAS) +def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>; +} + +// String manipulation instructions +let SchedRW = [WriteMicrocoded] in { +let Defs = [AL,ESI], Uses = [ESI,DF] in +def LODSB : I<0xAC, RawFrmSrc, (outs), (ins srcidx8:$src), + "lodsb\t{$src, %al|al, $src}", []>; +let Defs = [AX,ESI], Uses = [ESI,DF] in +def LODSW : I<0xAD, RawFrmSrc, (outs), (ins srcidx16:$src), + "lodsw\t{$src, %ax|ax, $src}", []>, OpSize16; +let Defs = [EAX,ESI], Uses = [ESI,DF] in +def LODSL : I<0xAD, RawFrmSrc, (outs), (ins srcidx32:$src), + "lods{l|d}\t{$src, %eax|eax, $src}", []>, OpSize32; +let Defs = [RAX,ESI], Uses = [ESI,DF] in +def LODSQ : RI<0xAD, RawFrmSrc, (outs), (ins srcidx64:$src), + "lodsq\t{$src, %rax|rax, $src}", []>, + Requires<[In64BitMode]>; +} + +let SchedRW = [WriteSystem] in { +let Defs = [ESI], Uses = [DX,ESI,DF] in { +def OUTSB : I<0x6E, RawFrmSrc, (outs), (ins srcidx8:$src), + "outsb\t{$src, %dx|dx, $src}", []>; +def OUTSW : I<0x6F, RawFrmSrc, (outs), (ins srcidx16:$src), + "outsw\t{$src, %dx|dx, $src}", []>, OpSize16; +def OUTSL : I<0x6F, RawFrmSrc, (outs), (ins srcidx32:$src), + "outs{l|d}\t{$src, %dx|dx, $src}", []>, OpSize32; +} + +let Defs = [EDI], Uses = [DX,EDI,DF] in { +def INSB : I<0x6C, RawFrmDst, (outs), (ins dstidx8:$dst), + "insb\t{%dx, $dst|$dst, dx}", []>; +def INSW : I<0x6D, RawFrmDst, (outs), (ins dstidx16:$dst), + "insw\t{%dx, $dst|$dst, dx}", []>, OpSize16; +def INSL : I<0x6D, RawFrmDst, (outs), (ins dstidx32:$dst), + "ins{l|d}\t{%dx, $dst|$dst, dx}", []>, OpSize32; +} +} + +// EFLAGS management instructions. +let SchedRW = [WriteALU], Defs = [EFLAGS], Uses = [EFLAGS] in { +def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>; +def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>; +def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>; +} + +// DF management instructions. 
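+// cld/std below clear and set DF, which selects the direction in which the
+// string instructions above (movs/stos/lods/cmps/scas/ins/outs) step ESI/EDI.
+// A rough C model of "rep movsb" under both settings, for illustration only:
+//
+//   #include <stddef.h>
+//   #include <stdint.h>
+//   static void rep_movsb(uint8_t *edi, const uint8_t *esi, size_t ecx, int df) {
+//     int step = df ? -1 : 1;           // std sets DF, cld clears it
+//     while (ecx--) { *edi = *esi; edi += step; esi += step; }
+//   }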
+let SchedRW = [WriteALU], Defs = [DF] in { +def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>; +def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>; +} + +// Table lookup instructions +let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in +def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>, Sched<[WriteLoad]>; + +let SchedRW = [WriteMicrocoded] in { +// ASCII Adjust After Addition +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", []>, + Requires<[Not64BitMode]>; + +// ASCII Adjust AX Before Division +let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src), + "aad\t$src", []>, Requires<[Not64BitMode]>; + +// ASCII Adjust AX After Multiply +let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src), + "aam\t$src", []>, Requires<[Not64BitMode]>; + +// ASCII Adjust AL After Subtraction - sets +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in +def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", []>, + Requires<[Not64BitMode]>; + +// Decimal Adjust AL after Addition +let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in +def DAA : I<0x27, RawFrm, (outs), (ins), "daa", []>, + Requires<[Not64BitMode]>; + +// Decimal Adjust AL after Subtraction +let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in +def DAS : I<0x2F, RawFrm, (outs), (ins), "das", []>, + Requires<[Not64BitMode]>; +} // SchedRW + +let SchedRW = [WriteSystem] in { +// Check Array Index Against Bounds +// Note: "bound" does not have reversed operands in at&t syntax. +def BOUNDS16rm : I<0x62, MRMSrcMem, (outs GR16:$dst), (ins i32mem:$src), + "bound\t$dst, $src", []>, OpSize16, + Requires<[Not64BitMode]>; +def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i64mem:$src), + "bound\t$dst, $src", []>, OpSize32, + Requires<[Not64BitMode]>; + +// Adjust RPL Field of Segment Selector +def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src), + "arpl\t{$src, $dst|$dst, $src}", []>, + Requires<[Not64BitMode]>, NotMemoryFoldable; +let mayStore = 1 in +def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "arpl\t{$src, $dst|$dst, $src}", []>, + Requires<[Not64BitMode]>, NotMemoryFoldable; +} // SchedRW + +//===----------------------------------------------------------------------===// +// MOVBE Instructions +// +let Predicates = [HasMOVBE] in { + let SchedRW = [WriteALULd] in { + def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "movbe{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>, + OpSize16, T8PS; + def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "movbe{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>, + OpSize32, T8PS; + def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "movbe{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>, + T8PS; + } + let SchedRW = [WriteStore] in { + def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), + "movbe{w}\t{$src, $dst|$dst, $src}", + [(store (bswap GR16:$src), addr:$dst)]>, + OpSize16, T8PS; + def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movbe{l}\t{$src, $dst|$dst, $src}", + [(store (bswap GR32:$src), addr:$dst)]>, + OpSize32, T8PS; + def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + 
"movbe{q}\t{$src, $dst|$dst, $src}", + [(store (bswap GR64:$src), addr:$dst)]>, + T8PS; + } +} + +//===----------------------------------------------------------------------===// +// RDRAND Instruction +// +let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in { + def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins), + "rdrand{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86rdrand))]>, + OpSize16, PS; + def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins), + "rdrand{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86rdrand))]>, + OpSize32, PS; + def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins), + "rdrand{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86rdrand))]>, + PS; +} + +//===----------------------------------------------------------------------===// +// RDSEED Instruction +// +let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in { + def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), "rdseed{w}\t$dst", + [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, PS; + def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), "rdseed{l}\t$dst", + [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, PS; + def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdseed{q}\t$dst", + [(set GR64:$dst, EFLAGS, (X86rdseed))]>, PS; +} + +//===----------------------------------------------------------------------===// +// LZCNT Instruction +// +let Predicates = [HasLZCNT], Defs = [EFLAGS] in { + def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, + XS, OpSize16, Sched<[WriteLZCNT]>; + def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctlz (loadi16 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteLZCNTLd]>; + + def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, + XS, OpSize32, Sched<[WriteLZCNT]>; + def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "lzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctlz (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteLZCNTLd]>; + + def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "lzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>, + XS, Sched<[WriteLZCNT]>; + def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "lzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctlz (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS, Sched<[WriteLZCNTLd]>; +} + +//===----------------------------------------------------------------------===// +// BMI Instructions +// +let Predicates = [HasBMI], Defs = [EFLAGS] in { + def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "tzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, + XS, OpSize16, Sched<[WriteTZCNT]>; + def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "tzcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (cttz (loadi16 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteTZCNTLd]>; + + def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "tzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, + XS, OpSize32, Sched<[WriteTZCNT]>; + def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), 
(ins i32mem:$src), + "tzcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (cttz (loadi32 addr:$src))), + (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteTZCNTLd]>; + + def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "tzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>, + XS, Sched<[WriteTZCNT]>; + def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "tzcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (cttz (loadi64 addr:$src))), + (implicit EFLAGS)]>, XS, Sched<[WriteTZCNTLd]>; +} + +multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, + RegisterClass RC, X86MemOperand x86memop> { +let hasSideEffects = 0 in { + def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), + !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, + T8PS, VEX_4V, Sched<[WriteALU]>; + let mayLoad = 1 in + def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), + !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, + T8PS, VEX_4V, Sched<[WriteALULd]>; +} +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem>; + defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem>, VEX_W; + defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem>; + defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem>, VEX_W; + defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem>; + defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem>, VEX_W; +} + +//===----------------------------------------------------------------------===// +// Pattern fragments to auto generate BMI instructions. +//===----------------------------------------------------------------------===// + +let Predicates = [HasBMI] in { + // FIXME: patterns for the load versions are not implemented + def : Pat<(and GR32:$src, (add GR32:$src, -1)), + (BLSR32rr GR32:$src)>; + def : Pat<(and GR64:$src, (add GR64:$src, -1)), + (BLSR64rr GR64:$src)>; + + def : Pat<(xor GR32:$src, (add GR32:$src, -1)), + (BLSMSK32rr GR32:$src)>; + def : Pat<(xor GR64:$src, (add GR64:$src, -1)), + (BLSMSK64rr GR64:$src)>; + + def : Pat<(and GR32:$src, (ineg GR32:$src)), + (BLSI32rr GR32:$src)>; + def : Pat<(and GR64:$src, (ineg GR64:$src)), + (BLSI64rr GR64:$src)>; +} + +multiclass bmi_bextr<bits<8> opc, string mnemonic, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, + PatFrag ld_frag, X86FoldableSchedWrite Sched> { + def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>, + T8PS, VEX, Sched<[Sched]>; + def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)), + (implicit EFLAGS)]>, T8PS, VEX, + Sched<[Sched.Folded, + // x86memop:$src1 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src2 + ReadAfterLd]>; +} + +let Predicates = [HasBMI], Defs = [EFLAGS] in { + defm BEXTR32 : bmi_bextr<0xF7, "bextr{l}", GR32, i32mem, + X86bextr, loadi32, WriteBEXTR>; + defm BEXTR64 : bmi_bextr<0xF7, "bextr{q}", GR64, i64mem, + X86bextr, loadi64, WriteBEXTR>, VEX_W; +} + +multiclass bmi_bzhi<bits<8> opc, string mnemonic, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int, + PatFrag ld_frag, X86FoldableSchedWrite Sched> { + def rr : I<opc, MRMSrcReg4VOp3, (outs 
RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>, + T8PS, VEX, Sched<[Sched]>; + def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)), + (implicit EFLAGS)]>, T8PS, VEX, + Sched<[Sched.Folded, + // x86memop:$src1 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src2 + ReadAfterLd]>; +} + +let Predicates = [HasBMI2], Defs = [EFLAGS] in { + defm BZHI32 : bmi_bzhi<0xF5, "bzhi{l}", GR32, i32mem, + int_x86_bmi_bzhi_32, loadi32, WriteBZHI>; + defm BZHI64 : bmi_bzhi<0xF5, "bzhi{q}", GR64, i64mem, + int_x86_bmi_bzhi_64, loadi64, WriteBZHI>, VEX_W; +} + +def CountTrailingOnes : SDNodeXForm<imm, [{ + // Count the trailing ones in the immediate. + return getI8Imm(countTrailingOnes(N->getZExtValue()), SDLoc(N)); +}]>; + +def BEXTRMaskXForm : SDNodeXForm<imm, [{ + unsigned Length = countTrailingOnes(N->getZExtValue()); + return getI32Imm(Length << 8, SDLoc(N)); +}]>; + +def AndMask64 : ImmLeaf<i64, [{ + return isMask_64(Imm) && !isUInt<32>(Imm); +}]>; + +// Use BEXTR for 64-bit 'and' with large immediate 'mask'. +let Predicates = [HasBMI, NoBMI2, NoTBM] in { + def : Pat<(and GR64:$src, AndMask64:$mask), + (BEXTR64rr GR64:$src, + (SUBREG_TO_REG (i64 0), + (MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>; + def : Pat<(and (loadi64 addr:$src), AndMask64:$mask), + (BEXTR64rm addr:$src, + (SUBREG_TO_REG (i64 0), + (MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>; +} + +// Use BZHI for 64-bit 'and' with large immediate 'mask'. +let Predicates = [HasBMI2, NoTBM] in { + def : Pat<(and GR64:$src, AndMask64:$mask), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>; + def : Pat<(and (loadi64 addr:$src), AndMask64:$mask), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>; +} + +let Predicates = [HasBMI2] in { + multiclass _bmi_bzhi_pattern<dag regpattern, dag mempattern, RegisterClass RC, + ValueType VT, Instruction DstInst, + Instruction DstMemInst> { + def : Pat<regpattern, + (DstInst RC:$src, + (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + def : Pat<mempattern, + (DstMemInst addr:$src, + (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + } + + multiclass bmi_bzhi_patterns<RegisterClass RC, int bitwidth, ValueType VT, + Instruction DstInst, X86MemOperand x86memop, + Instruction DstMemInst> { + // x & ((1 << y) - 1) + defm : _bmi_bzhi_pattern<(and RC:$src, (add (shl 1, GR8:$lz), -1)), + (and (x86memop addr:$src), + (add (shl 1, GR8:$lz), -1)), + RC, VT, DstInst, DstMemInst>; + + // x & ~(-1 << y) + defm : _bmi_bzhi_pattern<(and RC:$src, (xor (shl -1, GR8:$lz), -1)), + (and (x86memop addr:$src), + (xor (shl -1, GR8:$lz), -1)), + RC, VT, DstInst, DstMemInst>; + + // x & (-1 >> (bitwidth - y)) + defm : _bmi_bzhi_pattern<(and RC:$src, (srl -1, (sub bitwidth, GR8:$lz))), + (and (x86memop addr:$src), + (srl -1, (sub bitwidth, GR8:$lz))), + RC, VT, DstInst, DstMemInst>; + + // x << (bitwidth - y) >> (bitwidth - y) + defm : _bmi_bzhi_pattern<(srl (shl RC:$src, (sub bitwidth, GR8:$lz)), + (sub bitwidth, GR8:$lz)), + (srl (shl (x86memop addr:$src), + (sub bitwidth, GR8:$lz)), + (sub bitwidth, GR8:$lz)), + RC, VT, DstInst, DstMemInst>; + } + + defm : bmi_bzhi_patterns<GR32, 
32, i32, BZHI32rr, loadi32, BZHI32rm>; + defm : bmi_bzhi_patterns<GR64, 64, i64, BZHI64rr, loadi64, BZHI64rm>; + + // x & (-1 >> (32 - y)) + def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), + (BZHI32rr GR32:$src, GR32:$lz)>; + def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), + (BZHI32rm addr:$src, GR32:$lz)>; + + // x & (-1 >> (64 - y)) + def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + + // x << (32 - y) >> (32 - y) + def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))), + (i8 (trunc (sub 32, GR32:$lz)))), + (BZHI32rr GR32:$src, GR32:$lz)>; + def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))), + (i8 (trunc (sub 32, GR32:$lz)))), + (BZHI32rm addr:$src, GR32:$lz)>; + + // x << (64 - y) >> (64 - y) + def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))), + (i8 (trunc (sub 64, GR32:$lz)))), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))), + (i8 (trunc (sub 64, GR32:$lz)))), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; +} // HasBMI2 + +multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC, + X86MemOperand x86memop, Intrinsic Int, + PatFrag ld_frag> { + def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, RC:$src2))]>, + VEX_4V, Sched<[WriteALU]>; + def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, + VEX_4V, Sched<[WriteALULd, ReadAfterLd]>; +} + +let Predicates = [HasBMI2] in { + defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, + int_x86_bmi_pdep_32, loadi32>, T8XD; + defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, + int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W; + defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, + int_x86_bmi_pext_32, loadi32>, T8XS; + defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, + int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W; +} + +//===----------------------------------------------------------------------===// +// TBM Instructions +// +let Predicates = [HasTBM], Defs = [EFLAGS] in { + +multiclass tbm_ternary_imm<bits<8> opc, RegisterClass RC, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + SDNode OpNode, Operand immtype, + SDPatternOperator immoperator, + X86FoldableSchedWrite Sched> { + def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl), + !strconcat(OpcodeStr, + "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), + [(set RC:$dst, (OpNode RC:$src1, immoperator:$cntl))]>, + XOP, XOPA, Sched<[Sched]>; + def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src1, immtype:$cntl), + !strconcat(OpcodeStr, + "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), + [(set RC:$dst, (OpNode (ld_frag addr:$src1), immoperator:$cntl))]>, + XOP, XOPA, Sched<[Sched.Folded]>; +} + +defm BEXTRI32 : tbm_ternary_imm<0x10, GR32, "bextr{l}", i32mem, loadi32, + X86bextr, i32imm, imm, WriteBEXTR>; +let ImmT = Imm32S in +defm BEXTRI64 : tbm_ternary_imm<0x10, 
GR64, "bextr{q}", i64mem, loadi64, + X86bextr, i64i32imm, + i64immSExt32, WriteBEXTR>, VEX_W; + +multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem, + RegisterClass RC, string OpcodeStr, + X86MemOperand x86memop, X86FoldableSchedWrite Sched> { +let hasSideEffects = 0 in { + def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>, + XOP_4V, XOP9, Sched<[Sched]>; + let mayLoad = 1 in + def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>, + XOP_4V, XOP9, Sched<[Sched.Folded]>; +} +} + +multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr, + X86FoldableSchedWrite Sched, + Format FormReg, Format FormMem> { + defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr#"{l}", + i32mem, Sched>; + defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr#"{q}", + i64mem, Sched>, VEX_W; +} + +defm BLCFILL : tbm_binary_intr<0x01, "blcfill", WriteALU, MRM1r, MRM1m>; +defm BLCI : tbm_binary_intr<0x02, "blci", WriteALU, MRM6r, MRM6m>; +defm BLCIC : tbm_binary_intr<0x01, "blcic", WriteALU, MRM5r, MRM5m>; +defm BLCMSK : tbm_binary_intr<0x02, "blcmsk", WriteALU, MRM1r, MRM1m>; +defm BLCS : tbm_binary_intr<0x01, "blcs", WriteALU, MRM3r, MRM3m>; +defm BLSFILL : tbm_binary_intr<0x01, "blsfill", WriteALU, MRM2r, MRM2m>; +defm BLSIC : tbm_binary_intr<0x01, "blsic", WriteALU, MRM6r, MRM6m>; +defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", WriteALU, MRM7r, MRM7m>; +defm TZMSK : tbm_binary_intr<0x01, "tzmsk", WriteALU, MRM4r, MRM4m>; +} // HasTBM, EFLAGS + +// Use BEXTRI for 64-bit 'and' with large immediate 'mask'. +let Predicates = [HasTBM] in { + def : Pat<(and GR64:$src, AndMask64:$mask), + (BEXTRI64ri GR64:$src, (BEXTRMaskXForm imm:$mask))>; + + def : Pat<(and (loadi64 addr:$src), AndMask64:$mask), + (BEXTRI64mi addr:$src, (BEXTRMaskXForm imm:$mask))>; +} + +//===----------------------------------------------------------------------===// +// Lightweight Profiling Instructions + +let Predicates = [HasLWP], SchedRW = [WriteSystem] in { + +def LLWPCB : I<0x12, MRM0r, (outs), (ins GR32:$src), "llwpcb\t$src", + [(int_x86_llwpcb GR32:$src)]>, XOP, XOP9; +def SLWPCB : I<0x12, MRM1r, (outs GR32:$dst), (ins), "slwpcb\t$dst", + [(set GR32:$dst, (int_x86_slwpcb))]>, XOP, XOP9; + +def LLWPCB64 : I<0x12, MRM0r, (outs), (ins GR64:$src), "llwpcb\t$src", + [(int_x86_llwpcb GR64:$src)]>, XOP, XOP9, VEX_W; +def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst", + [(set GR64:$dst, (int_x86_slwpcb))]>, XOP, XOP9, VEX_W; + +multiclass lwpins_intr<RegisterClass RC> { + def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), + "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", + [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))]>, + XOP_4V, XOPA; + let mayLoad = 1 in + def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), + "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", + [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))]>, + XOP_4V, XOPA; +} + +let Defs = [EFLAGS] in { + defm LWPINS32 : lwpins_intr<GR32>; + defm LWPINS64 : lwpins_intr<GR64>, VEX_W; +} // EFLAGS + +multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> { + def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), + "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", + [(Int RC:$src0, GR32:$src1, imm:$cntl)]>, XOP_4V, XOPA; + let mayLoad = 1 in + def rmi : Ii32<0x12, MRM1m, (outs), (ins 
RC:$src0, i32mem:$src1, i32imm:$cntl), + "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", + [(Int RC:$src0, (loadi32 addr:$src1), imm:$cntl)]>, + XOP_4V, XOPA; +} + +defm LWPVAL32 : lwpval_intr<GR32, int_x86_lwpval32>; +defm LWPVAL64 : lwpval_intr<GR64, int_x86_lwpval64>, VEX_W; + +} // HasLWP, SchedRW + +//===----------------------------------------------------------------------===// +// MONITORX/MWAITX Instructions +// +let SchedRW = [ WriteSystem ] in { + let usesCustomInserter = 1 in { + def MONITORX : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3), + [(int_x86_monitorx addr:$src1, GR32:$src2, GR32:$src3)]>, + Requires<[ HasMWAITX ]>; + } + + let Uses = [ EAX, ECX, EDX ] in { + def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", []>, + TB, Requires<[ HasMWAITX ]>; + } + + let Uses = [ ECX, EAX, EBX ] in { + def MWAITXrrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", + [(int_x86_mwaitx ECX, EAX, EBX)]>, + TB, Requires<[ HasMWAITX ]>; + } +} // SchedRW + +def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrrr)>, + Requires<[ Not64BitMode ]>; +def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrrr)>, + Requires<[ In64BitMode ]>; + +def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>, + Requires<[ Not64BitMode ]>; +def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>, + Requires<[ In64BitMode ]>; + +//===----------------------------------------------------------------------===// +// WAITPKG Instructions +// +let SchedRW = [WriteSystem] in { + def UMONITOR16 : I<0xAE, MRM6r, (outs), (ins GR16:$src), + "umonitor\t$src", [(int_x86_umonitor GR16:$src)]>, + XS, AdSize16, Requires<[HasWAITPKG, Not64BitMode]>; + def UMONITOR32 : I<0xAE, MRM6r, (outs), (ins GR32:$src), + "umonitor\t$src", [(int_x86_umonitor GR32:$src)]>, + XS, AdSize32, Requires<[HasWAITPKG]>; + def UMONITOR64 : I<0xAE, MRM6r, (outs), (ins GR64:$src), + "umonitor\t$src", [(int_x86_umonitor GR64:$src)]>, + XS, AdSize64, Requires<[HasWAITPKG, In64BitMode]>; + let Uses = [EAX, EDX], Defs = [EFLAGS] in { + def UMWAIT : I<0xAE, MRM6r, + (outs), (ins GR32orGR64:$src), "umwait\t$src", + [(set EFLAGS, (X86umwait GR32orGR64:$src, EDX, EAX))]>, + XD, Requires<[HasWAITPKG]>; + def TPAUSE : I<0xAE, MRM6r, + (outs), (ins GR32orGR64:$src), "tpause\t$src", + [(set EFLAGS, (X86tpause GR32orGR64:$src, EDX, EAX))]>, + PD, Requires<[HasWAITPKG]>, NotMemoryFoldable; + } +} // SchedRW + +//===----------------------------------------------------------------------===// +// MOVDIRI - Move doubleword/quadword as direct store +// +let SchedRW = [WriteStore] in { +def MOVDIRI32 : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movdiri\t{$src, $dst|$dst, $src}", + [(int_x86_directstore32 addr:$dst, GR32:$src)]>, + T8, Requires<[HasMOVDIRI]>; +def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movdiri\t{$src, $dst|$dst, $src}", + [(int_x86_directstore64 addr:$dst, GR64:$src)]>, + T8, Requires<[In64BitMode, HasMOVDIRI]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// MOVDIR64B - Move 64 bytes as direct store +// +let SchedRW = [WriteStore] in { +def MOVDIR64B16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src), + "movdir64b\t{$src, $dst|$dst, $src}", []>, + T8PD, AdSize16, Requires<[HasMOVDIR64B, Not64BitMode]>; +def MOVDIR64B32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src), + "movdir64b\t{$src, $dst|$dst, $src}", + [(int_x86_movdir64b 
GR32:$dst, addr:$src)]>, + T8PD, AdSize32, Requires<[HasMOVDIR64B]>; +def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src), + "movdir64b\t{$src, $dst|$dst, $src}", + [(int_x86_movdir64b GR64:$dst, addr:$src)]>, + T8PD, AdSize64, Requires<[HasMOVDIR64B, In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// CLZERO Instruction +// +let SchedRW = [WriteSystem] in { + let Uses = [EAX] in + def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, + TB, Requires<[HasCLZERO]>; + + let usesCustomInserter = 1 in { + def CLZERO : PseudoI<(outs), (ins i32mem:$src1), + [(int_x86_clzero addr:$src1)]>, Requires<[HasCLZERO]>; + } +} // SchedRW + +def : InstAlias<"clzero\t{%eax|eax}", (CLZEROr)>, Requires<[Not64BitMode]>; +def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// Pattern fragments to auto generate TBM instructions. +//===----------------------------------------------------------------------===// + +let Predicates = [HasTBM] in { + // FIXME: patterns for the load versions are not implemented + def : Pat<(and GR32:$src, (add GR32:$src, 1)), + (BLCFILL32rr GR32:$src)>; + def : Pat<(and GR64:$src, (add GR64:$src, 1)), + (BLCFILL64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (not (add GR32:$src, 1))), + (BLCI32rr GR32:$src)>; + def : Pat<(or GR64:$src, (not (add GR64:$src, 1))), + (BLCI64rr GR64:$src)>; + + // Extra patterns because opt can optimize the above patterns to this. + def : Pat<(or GR32:$src, (sub -2, GR32:$src)), + (BLCI32rr GR32:$src)>; + def : Pat<(or GR64:$src, (sub -2, GR64:$src)), + (BLCI64rr GR64:$src)>; + + def : Pat<(and (not GR32:$src), (add GR32:$src, 1)), + (BLCIC32rr GR32:$src)>; + def : Pat<(and (not GR64:$src), (add GR64:$src, 1)), + (BLCIC64rr GR64:$src)>; + + def : Pat<(xor GR32:$src, (add GR32:$src, 1)), + (BLCMSK32rr GR32:$src)>; + def : Pat<(xor GR64:$src, (add GR64:$src, 1)), + (BLCMSK64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (add GR32:$src, 1)), + (BLCS32rr GR32:$src)>; + def : Pat<(or GR64:$src, (add GR64:$src, 1)), + (BLCS64rr GR64:$src)>; + + def : Pat<(or GR32:$src, (add GR32:$src, -1)), + (BLSFILL32rr GR32:$src)>; + def : Pat<(or GR64:$src, (add GR64:$src, -1)), + (BLSFILL64rr GR64:$src)>; + + def : Pat<(or (not GR32:$src), (add GR32:$src, -1)), + (BLSIC32rr GR32:$src)>; + def : Pat<(or (not GR64:$src), (add GR64:$src, -1)), + (BLSIC64rr GR64:$src)>; + + def : Pat<(or (not GR32:$src), (add GR32:$src, 1)), + (T1MSKC32rr GR32:$src)>; + def : Pat<(or (not GR64:$src), (add GR64:$src, 1)), + (T1MSKC64rr GR64:$src)>; + + def : Pat<(and (not GR32:$src), (add GR32:$src, -1)), + (TZMSK32rr GR32:$src)>; + def : Pat<(and (not GR64:$src), (add GR64:$src, -1)), + (TZMSK64rr GR64:$src)>; +} // HasTBM + +//===----------------------------------------------------------------------===// +// Memory Instructions +// + +let Predicates = [HasCLFLUSHOPT], SchedRW = [WriteLoad] in +def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src), + "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD; + +let Predicates = [HasCLWB], SchedRW = [WriteLoad] in +def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", + [(int_x86_clwb addr:$src)]>, PD, NotMemoryFoldable; + +let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in +def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src", + [(int_x86_cldemote addr:$src)]>, TB; + 
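The BMI2/TBM selection patterns above reduce to a small set of scalar bit-manipulation identities. As a quick reference (illustrative C, not part of the TableGen sources), these are the expressions the Pat<> fragments recognize:

    /* Reference only: scalar forms matched by the BZHI/BEXTR/TBM patterns above.
     * Not part of the original .td files. */
    #include <stdint.h>

    /* BZHI: all four pattern forms clear the bits of x at position y and above
     * (assuming 0 < y < 32 so the shifts stay in range). */
    uint32_t bzhi32(uint32_t x, unsigned y) {
      return x & ((1u << y) - 1);      /* == x & ~(~0u << y)
                                          == x & (~0u >> (32 - y))
                                          == (x << (32 - y)) >> (32 - y) */
    }

    /* AndMask64: a 64-bit AND with a contiguous low-bit mask wider than 32 bits
     * becomes BEXTR (start 0, length n, i.e. control = n << 8) or BZHI (index n,
     * n = number of trailing ones in the mask). Assumes 32 < n < 64. */
    uint64_t and_mask64(uint64_t x, unsigned n) {
      return x & ((1ULL << n) - 1);
    }

    /* TBM instructions and the expressions their Pat<> fragments match. */
    uint32_t blcfill(uint32_t x) { return  x & (x + 1); }
    uint32_t blci   (uint32_t x) { return  x | ~(x + 1); }  /* also x | (-2 - x) */
    uint32_t blcic  (uint32_t x) { return ~x & (x + 1); }
    uint32_t blcmsk (uint32_t x) { return  x ^ (x + 1); }
    uint32_t blcs   (uint32_t x) { return  x | (x + 1); }
    uint32_t blsfill(uint32_t x) { return  x | (x - 1); }
    uint32_t blsic  (uint32_t x) { return ~x | (x - 1); }
    uint32_t t1mskc (uint32_t x) { return ~x | (x + 1); }
    uint32_t tzmsk  (uint32_t x) { return ~x & (x - 1); }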
+//===----------------------------------------------------------------------===// +// Subsystems. +//===----------------------------------------------------------------------===// + +include "X86Capstone.td" + +include "X86InstrArithmetic.td" +include "X86InstrCMovSetCC.td" +include "X86InstrExtension.td" +include "X86InstrControl.td" +include "X86InstrShiftRotate.td" + +// X87 Floating Point Stack. +//include "X86InstrFPStack.td" + +// SIMD support (SSE, MMX and AVX) +//include "X86InstrFragmentsSIMD.td" + +// FMA - Fused Multiply-Add support (requires FMA) +//include "X86InstrFMA.td" + +// XOP +//include "X86InstrXOP.td" + +// SSE, MMX and 3DNow! vector support. +//include "X86InstrSSE.td" +//include "X86InstrAVX512.td" +//include "X86InstrMMX.td" +//include "X86Instr3DNow.td" + +// MPX instructions +//include "X86InstrMPX.td" + +include "X86InstrVMX.td" +include "X86InstrSVM.td" + +//include "X86InstrTSX.td" +//include "X86InstrSGX.td" + +// System instructions. +include "X86InstrSystem.td" + +// Compiler Pseudo Instructions and Pat Patterns +//include "X86InstrCompiler.td" +//include "X86InstrVecCompiler.td" + +//===----------------------------------------------------------------------===// +// Assembler Mnemonic Aliases +//===----------------------------------------------------------------------===// + +def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"call", "calll", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"call", "callq", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"cbw", "cbtw", "att">; +def : MnemonicAlias<"cwde", "cwtl", "att">; +def : MnemonicAlias<"cwd", "cwtd", "att">; +def : MnemonicAlias<"cdq", "cltd", "att">; +def : MnemonicAlias<"cdqe", "cltq", "att">; +def : MnemonicAlias<"cqo", "cqto", "att">; + +// In 64-bit mode lret maps to lretl; it is not ambiguous with lretq. +def : MnemonicAlias<"lret", "lretw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>; + +def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>; +def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; + +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; + +def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popf", "popfq", "intel">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popfd", "popfl", "att">; + +// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in +// all modes. However: "push (addr)" and "push $42" should default to +// pushl/pushq depending on the current mode. 
Similar for "pop %bx" +def : MnemonicAlias<"push", "pushw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"push", "pushl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushf", "pushfq", "intel">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushfd", "pushfl", "att">; + +def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>; +def : MnemonicAlias<"pushad", "pushal", "intel">, Requires<[Not64BitMode]>; +def : MnemonicAlias<"popa", "popaw", "intel">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pusha", "pushaw", "intel">, Requires<[In16BitMode]>; +def : MnemonicAlias<"popa", "popal", "intel">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pusha", "pushal", "intel">, Requires<[In32BitMode]>; + +def : MnemonicAlias<"popa", "popaw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>; + +def : MnemonicAlias<"repe", "rep">; +def : MnemonicAlias<"repz", "rep">; +def : MnemonicAlias<"repnz", "repne">; + +def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>; + +// Apply 'ret' behavior to 'retn' +def : MnemonicAlias<"retn", "retw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"retn", "retl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"retn", "retq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"retn", "ret", "intel">; + +def : MnemonicAlias<"sal", "shl", "intel">; +def : MnemonicAlias<"salb", "shlb", "att">; +def : MnemonicAlias<"salw", "shlw", "att">; +def : MnemonicAlias<"sall", "shll", "att">; +def : MnemonicAlias<"salq", "shlq", "att">; + +def : MnemonicAlias<"smovb", "movsb", "att">; +def : MnemonicAlias<"smovw", "movsw", "att">; +def : MnemonicAlias<"smovl", "movsl", "att">; +def : MnemonicAlias<"smovq", "movsq", "att">; + +def : MnemonicAlias<"ud2a", "ud2", "att">; +def : MnemonicAlias<"verrw", "verr", "att">; + +// MS recognizes 'xacquire'/'xrelease' as 'acquire'/'release' +def : MnemonicAlias<"acquire", "xacquire", "intel">; +def : MnemonicAlias<"release", "xrelease", "intel">; + +// System instruction aliases. 
+def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>; +def : MnemonicAlias<"sysret", "sysretl", "att">; +def : MnemonicAlias<"sysexit", "sysexitl", "att">; + +def : MnemonicAlias<"lgdt", "lgdtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"lgdt", "lgdtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lgdt", "lgdtq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"lidt", "lidtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"lidt", "lidtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"lidt", "lidtq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sgdt", "sgdtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"sgdt", "sgdtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sgdt", "sgdtq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sidt", "sidtw", "att">, Requires<[In16BitMode]>; +def : MnemonicAlias<"sidt", "sidtl", "att">, Requires<[In32BitMode]>; +def : MnemonicAlias<"sidt", "sidtq", "att">, Requires<[In64BitMode]>; +//def : MnemonicAlias<"lgdt", "lgdtw", "intel">, Requires<[In16BitMode]>; +//def : MnemonicAlias<"lgdt", "lgdtd", "intel">, Requires<[In32BitMode]>; +//def : MnemonicAlias<"lidt", "lidtw", "intel">, Requires<[In16BitMode]>; +//def : MnemonicAlias<"lidt", "lidtd", "intel">, Requires<[In32BitMode]>; +//def : MnemonicAlias<"sgdt", "sgdtw", "intel">, Requires<[In16BitMode]>; +//def : MnemonicAlias<"sgdt", "sgdtd", "intel">, Requires<[In32BitMode]>; +//def : MnemonicAlias<"sidt", "sidtw", "intel">, Requires<[In16BitMode]>; +//def : MnemonicAlias<"sidt", "sidtd", "intel">, Requires<[In32BitMode]>; + + +// Floating point stack aliases. +def : MnemonicAlias<"fcmovz", "fcmove", "att">; +def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; +def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; +def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; +def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; +def : MnemonicAlias<"fcomip", "fcompi">; +def : MnemonicAlias<"fildq", "fildll", "att">; +def : MnemonicAlias<"fistpq", "fistpll", "att">; +def : MnemonicAlias<"fisttpq", "fisttpll", "att">; +def : MnemonicAlias<"fldcww", "fldcw", "att">; +def : MnemonicAlias<"fnstcww", "fnstcw", "att">; +def : MnemonicAlias<"fnstsww", "fnstsw", "att">; +def : MnemonicAlias<"fucomip", "fucompi">; +def : MnemonicAlias<"fwait", "wait">; + +def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; +def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">; +def : MnemonicAlias<"xsaveq", "xsave64", "att">; +def : MnemonicAlias<"xrstorq", "xrstor64", "att">; +def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">; +def : MnemonicAlias<"xrstorsq", "xrstors64", "att">; +def : MnemonicAlias<"xsavecq", "xsavec64", "att">; +def : MnemonicAlias<"xsavesq", "xsaves64", "att">; + +class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, + string VariantName> + : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix), + !strconcat(Prefix, NewCond, Suffix), VariantName>; + +/// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of +/// MnemonicAlias's that canonicalize the condition code in a mnemonic, for +/// example "setz" -> "sete". 
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix, + string V = ""> { + def C : CondCodeAlias<Prefix, Suffix, "c", "b", V>; // setc -> setb + def Z : CondCodeAlias<Prefix, Suffix, "z" , "e", V>; // setz -> sete + def NA : CondCodeAlias<Prefix, Suffix, "na", "be", V>; // setna -> setbe + def NB : CondCodeAlias<Prefix, Suffix, "nb", "ae", V>; // setnb -> setae + def NC : CondCodeAlias<Prefix, Suffix, "nc", "ae", V>; // setnc -> setae + def NG : CondCodeAlias<Prefix, Suffix, "ng", "le", V>; // setng -> setle + def NL : CondCodeAlias<Prefix, Suffix, "nl", "ge", V>; // setnl -> setge + def NZ : CondCodeAlias<Prefix, Suffix, "nz", "ne", V>; // setnz -> setne + def PE : CondCodeAlias<Prefix, Suffix, "pe", "p", V>; // setpe -> setp + def PO : CondCodeAlias<Prefix, Suffix, "po", "np", V>; // setpo -> setnp + + def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b", V>; // setnae -> setb + def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a", V>; // setnbe -> seta + def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l", V>; // setnge -> setl + def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g", V>; // setnle -> setg +} + +// Aliases for set<CC> +defm : IntegerCondCodeMnemonicAlias<"set", "">; +// Aliases for j<CC> +defm : IntegerCondCodeMnemonicAlias<"j", "">; +// Aliases for cmov<CC>{w,l,q} +defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">; +defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">; +// No size suffix for intel-style asm. +defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">; + + +//===----------------------------------------------------------------------===// +// Assembler Instruction Aliases +//===----------------------------------------------------------------------===// + +// aad/aam default to base 10 if no operand is specified. +def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>; +def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; + +// Disambiguate the mem/imm form of bt-without-a-suffix as btl. +// Likewise for btc/btr/bts. +def : InstAlias<"bt\t{$imm, $mem|$mem, $imm}", + (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; +def : InstAlias<"btc\t{$imm, $mem|$mem, $imm}", + (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; +def : InstAlias<"btr\t{$imm, $mem|$mem, $imm}", + (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; +def : InstAlias<"bts\t{$imm, $mem|$mem, $imm}", + (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0, "att">; + +// clr aliases. +def : InstAlias<"clr{b}\t$reg", (XOR8rr GR8 :$reg, GR8 :$reg), 0>; +def : InstAlias<"clr{w}\t$reg", (XOR16rr GR16:$reg, GR16:$reg), 0>; +def : InstAlias<"clr{l}\t$reg", (XOR32rr GR32:$reg, GR32:$reg), 0>; +def : InstAlias<"clr{q}\t$reg", (XOR64rr GR64:$reg, GR64:$reg), 0>; + +// lods aliases. Accept the destination being omitted because it's implicit +// in the mnemonic, or the mnemonic suffix being omitted because it's implicit +// in the destination. 
+def : InstAlias<"lodsb\t$src", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lodsw\t$src", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods{l|d}\t$src", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lodsq\t$src", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lods\t{$src, %al|al, $src}", (LODSB srcidx8:$src), 0>; +def : InstAlias<"lods\t{$src, %ax|ax, $src}", (LODSW srcidx16:$src), 0>; +def : InstAlias<"lods\t{$src, %eax|eax, $src}", (LODSL srcidx32:$src), 0>; +def : InstAlias<"lods\t{$src, %rax|rax, $src}", (LODSQ srcidx64:$src), 0>, Requires<[In64BitMode]>; +def : InstAlias<"lods\t$src", (LODSB srcidx8:$src), 0, "intel">; +def : InstAlias<"lods\t$src", (LODSW srcidx16:$src), 0, "intel">; +def : InstAlias<"lods\t$src", (LODSL srcidx32:$src), 0, "intel">; +def : InstAlias<"lods\t$src", (LODSQ srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>; + + +// stos aliases. Accept the source being omitted because it's implicit in +// the mnemonic, or the mnemonic suffix being omitted because it's implicit +// in the source. +def : InstAlias<"stosb\t$dst", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stosw\t$dst", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos{l|d}\t$dst", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stosq\t$dst", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stos\t{%al, $dst|$dst, al}", (STOSB dstidx8:$dst), 0>; +def : InstAlias<"stos\t{%ax, $dst|$dst, ax}", (STOSW dstidx16:$dst), 0>; +def : InstAlias<"stos\t{%eax, $dst|$dst, eax}", (STOSL dstidx32:$dst), 0>; +def : InstAlias<"stos\t{%rax, $dst|$dst, rax}", (STOSQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"stos\t$dst", (STOSB dstidx8:$dst), 0, "intel">; +def : InstAlias<"stos\t$dst", (STOSW dstidx16:$dst), 0, "intel">; +def : InstAlias<"stos\t$dst", (STOSL dstidx32:$dst), 0, "intel">; +def : InstAlias<"stos\t$dst", (STOSQ dstidx64:$dst), 0, "intel">, Requires<[In64BitMode]>; + + +// scas aliases. Accept the destination being omitted because it's implicit +// in the mnemonic, or the mnemonic suffix being omitted because it's implicit +// in the destination. +def : InstAlias<"scasb\t$dst", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scasw\t$dst", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas{l|d}\t$dst", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scasq\t$dst", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scas\t{$dst, %al|al, $dst}", (SCASB dstidx8:$dst), 0>; +def : InstAlias<"scas\t{$dst, %ax|ax, $dst}", (SCASW dstidx16:$dst), 0>; +def : InstAlias<"scas\t{$dst, %eax|eax, $dst}", (SCASL dstidx32:$dst), 0>; +def : InstAlias<"scas\t{$dst, %rax|rax, $dst}", (SCASQ dstidx64:$dst), 0>, Requires<[In64BitMode]>; +def : InstAlias<"scas\t$dst", (SCASB dstidx8:$dst), 0, "intel">; +def : InstAlias<"scas\t$dst", (SCASW dstidx16:$dst), 0, "intel">; +def : InstAlias<"scas\t$dst", (SCASL dstidx32:$dst), 0, "intel">; +def : InstAlias<"scas\t$dst", (SCASQ dstidx64:$dst), 0, "intel">, Requires<[In64BitMode]>; + +// cmps aliases. Mnemonic suffix being omitted because it's implicit +// in the destination. +def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSB dstidx8:$dst, srcidx8:$src), 0, "intel">; +def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSW dstidx16:$dst, srcidx16:$src), 0, "intel">; +def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSL dstidx32:$dst, srcidx32:$src), 0, "intel">; +def : InstAlias<"cmps\t{$dst, $src|$src, $dst}", (CMPSQ dstidx64:$dst, srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>; + +// movs aliases. 
Mnemonic suffix being omitted because it's implicit +// in the destination. +def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSB dstidx8:$dst, srcidx8:$src), 0, "intel">; +def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSW dstidx16:$dst, srcidx16:$src), 0, "intel">; +def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSL dstidx32:$dst, srcidx32:$src), 0, "intel">; +def : InstAlias<"movs\t{$src, $dst|$dst, $src}", (MOVSQ dstidx64:$dst, srcidx64:$src), 0, "intel">, Requires<[In64BitMode]>; + +// div and idiv aliases for explicit A register. +def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r GR8 :$src)>; +def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>; +def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>; +def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>; +def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m i8mem :$src)>; +def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>; +def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>; +def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>; +def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r GR8 :$src)>; +def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>; +def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>; +def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>; +def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m i8mem :$src)>; +def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>; +def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>; +def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>; + + + +// Various unary fpstack operations default to operating on ST1. +// For example, "fxch" -> "fxch %st(1)" +def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; +def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>; +def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; +def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; +def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; +def : InstAlias<"fxch", (XCH_F ST1), 0>; +def : InstAlias<"fcom", (COM_FST0r ST1), 0>; +def : InstAlias<"fcomp", (COMP_FST0r ST1), 0>; +def : InstAlias<"fcomi", (COM_FIr ST1), 0>; +def : InstAlias<"fcompi", (COM_FIPr ST1), 0>; +def : InstAlias<"fucom", (UCOM_Fr ST1), 0>; +def : InstAlias<"fucomp", (UCOM_FPr ST1), 0>; +def : InstAlias<"fucomi", (UCOM_FIr ST1), 0>; +def : InstAlias<"fucompi", (UCOM_FIPr ST1), 0>; + +// Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op. +// For example, "fadd %st(4), %st(0)" -> "fadd %st(4)". We also disambiguate +// instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with +// gas. 
+multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> { + def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"), + (Inst RST:$op), EmitAlias>; + def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"), + (Inst ST0), EmitAlias>; +} + +defm : FpUnaryAlias<"fadd", ADD_FST0r>; +defm : FpUnaryAlias<"faddp", ADD_FPrST0, 0>; +defm : FpUnaryAlias<"fsub", SUB_FST0r>; +defm : FpUnaryAlias<"fsub{|r}p", SUBR_FPrST0>; +defm : FpUnaryAlias<"fsubr", SUBR_FST0r>; +defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>; +defm : FpUnaryAlias<"fmul", MUL_FST0r>; +defm : FpUnaryAlias<"fmulp", MUL_FPrST0>; +defm : FpUnaryAlias<"fdiv", DIV_FST0r>; +defm : FpUnaryAlias<"fdiv{|r}p", DIVR_FPrST0>; +defm : FpUnaryAlias<"fdivr", DIVR_FST0r>; +defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>; +defm : FpUnaryAlias<"fcomi", COM_FIr, 0>; +defm : FpUnaryAlias<"fucomi", UCOM_FIr, 0>; +defm : FpUnaryAlias<"fcompi", COM_FIPr>; +defm : FpUnaryAlias<"fucompi", UCOM_FIPr>; + + +// Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they +// commute. We also allow fdiv[r]p/fsubrp even though they don't commute, +// solely because gas supports it. +def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>; +def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>; +def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>; +def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>; +def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>; + +def : InstAlias<"fnstsw" , (FNSTSW16r), 0>; + +// lcall and ljmp aliases. This seems to be an odd mapping in 64-bit mode, but +// this is compatible with what GAS does. +def : InstAlias<"lcall\t$seg : $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>; +def : InstAlias<"ljmp\t$seg : $off", (FARJMP32i i32imm:$off, i16imm:$seg), 0>, Requires<[In32BitMode]>; +def : InstAlias<"lcall\t{*}$dst", (FARCALL32m opaquemem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"ljmp\t{*}$dst", (FARJMP32m opaquemem:$dst), 0>, Requires<[Not16BitMode]>; +def : InstAlias<"lcall\t$seg : $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp\t$seg : $off", (FARJMP16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>; +def : InstAlias<"lcall\t{*}$dst", (FARCALL16m opaquemem:$dst), 0>, Requires<[In16BitMode]>; +def : InstAlias<"ljmp\t{*}$dst", (FARJMP16m opaquemem:$dst), 0>, Requires<[In16BitMode]>; + +def : InstAlias<"jmp\t{*}$dst", (JMP64m i64mem:$dst), 0, "att">, Requires<[In64BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP32m i32mem:$dst), 0, "att">, Requires<[In32BitMode]>; +def : InstAlias<"jmp\t{*}$dst", (JMP16m i16mem:$dst), 0, "att">, Requires<[In16BitMode]>; + + +// "imul <imm>, B" is an alias for "imul <imm>, B, B". 
+def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; +def : InstAlias<"imul{w}\t{$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; +def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; +def : InstAlias<"imul{l}\t{$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; +def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; +def : InstAlias<"imul{q}\t{$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; + +// ins aliases. Accept the mnemonic suffix being omitted because it's implicit +// in the destination. +def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSB dstidx8:$dst), 0, "intel">; +def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSW dstidx16:$dst), 0, "intel">; +def : InstAlias<"ins\t{%dx, $dst|$dst, dx}", (INSL dstidx32:$dst), 0, "intel">; + +// outs aliases. Accept the mnemonic suffix being omitted because it's implicit +// in the source. +def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSB srcidx8:$src), 0, "intel">; +def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSW srcidx16:$src), 0, "intel">; +def : InstAlias<"outs\t{$src, %dx|dx, $src}", (OUTSL srcidx32:$src), 0, "intel">; + +// inb %dx -> inb %al, %dx +def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; +def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>; +def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>; +def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>; +def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>; +def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>; + + +// jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp +def : InstAlias<"call\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"jmp\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>; +def : InstAlias<"call\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>; +def : InstAlias<"jmp\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[In32BitMode]>; +def : InstAlias<"callw\t$seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>; +def : InstAlias<"jmpw\t$seg, $off", (FARJMP16i i16imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>; +def : InstAlias<"calll\t$seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>; +def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Requires<[Not64BitMode]>; + +// Match 'movq <largeimm>, <reg>' as an alias for movabsq. +def : InstAlias<"mov{q}\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; + +// Match 'movd GR64, MMX' as an alias for movq to be compatible with gas, +// which supports this due to an old AMD documentation bug when 64-bit mode was +// created. 
+def : InstAlias<"movd\t{$src, $dst|$dst, $src}", + (MMX_MOVD64to64rr VR64:$dst, GR64:$src), 0>; +def : InstAlias<"movd\t{$src, $dst|$dst, $src}", + (MMX_MOVD64from64rr GR64:$dst, VR64:$src), 0>; + +// movsx aliases +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rr8 GR16:$dst, GR8:$src), 0, "att">; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX16rm8 GR16:$dst, i8mem:$src), 0, "att">; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr8 GR32:$dst, GR8:$src), 0, "att">; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX32rr16 GR32:$dst, GR16:$src), 0, "att">; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr8 GR64:$dst, GR8:$src), 0, "att">; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr16 GR64:$dst, GR16:$src), 0, "att">; +def : InstAlias<"movsx\t{$src, $dst|$dst, $src}", (MOVSX64rr32 GR64:$dst, GR32:$src), 0, "att">; + +// movzx aliases +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rr8 GR16:$dst, GR8:$src), 0, "att">; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX16rm8 GR16:$dst, i8mem:$src), 0, "att">; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr8 GR32:$dst, GR8:$src), 0, "att">; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX32rr16 GR32:$dst, GR16:$src), 0, "att">; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr8 GR64:$dst, GR8:$src), 0, "att">; +def : InstAlias<"movzx\t{$src, $dst|$dst, $src}", (MOVZX64rr16 GR64:$dst, GR16:$src), 0, "att">; +// Note: No GR32->GR64 movzx form. + +// outb %dx -> outb %al, %dx +def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>; +def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>; +def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>; +def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>; +def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>; +def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>; + +// 'sldt <mem>' can be encoded with either sldtw or sldtq with the same +// effect (both store to a 16-bit mem). Force to sldtw to avoid ambiguity +// errors, since its encoding is the most compact. +def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>; + +// shld/shrd op,op -> shld op, op, CL +def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>; +def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>; +def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>; +def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>; + +def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>; +def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>; +def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>; +def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>; + +/* FIXME: This is disabled because the asm matcher is currently incapable of + * matching a fixed immediate like $1. +// "shl X, $1" is an alias for "shl X". 
+multiclass ShiftRotateByOneAlias<string Mnemonic, string Opc> { + def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "8r1")) GR8:$op)>; + def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "16r1")) GR16:$op)>; + def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "32r1")) GR32:$op)>; + def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "64r1")) GR64:$op)>; + def : InstAlias<!strconcat(Mnemonic, "b $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "8m1")) i8mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "w $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "16m1")) i16mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "l $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "32m1")) i32mem:$op)>; + def : InstAlias<!strconcat(Mnemonic, "q $op, $$1"), + (!cast<Instruction>(!strconcat(Opc, "64m1")) i64mem:$op)>; +} + +defm : ShiftRotateByOneAlias<"rcl", "RCL">; +defm : ShiftRotateByOneAlias<"rcr", "RCR">; +defm : ShiftRotateByOneAlias<"rol", "ROL">; +defm : ShiftRotateByOneAlias<"ror", "ROR">; +FIXME */ + +// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms. +def : InstAlias<"test{b}\t{$mem, $val|$val, $mem}", + (TEST8mr i8mem :$mem, GR8 :$val), 0>; +def : InstAlias<"test{w}\t{$mem, $val|$val, $mem}", + (TEST16mr i16mem:$mem, GR16:$val), 0>; +def : InstAlias<"test{l}\t{$mem, $val|$val, $mem}", + (TEST32mr i32mem:$mem, GR32:$val), 0>; +def : InstAlias<"test{q}\t{$mem, $val|$val, $mem}", + (TEST64mr i64mem:$mem, GR64:$val), 0>; + +// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms. +def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", + (XCHG8rm GR8 :$val, i8mem :$mem), 0>; +def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", + (XCHG16rm GR16:$val, i16mem:$mem), 0>; +def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", + (XCHG32rm GR32:$val, i32mem:$mem), 0>; +def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", + (XCHG64rm GR64:$val, i64mem:$mem), 0>; + +// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms. +def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>; +def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src), 0>; +def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>; + +// In 64-bit mode, xchg %eax, %eax can't be encoded with the 0x90 opcode we +// would get by default because it's defined as NOP. But xchg %eax, %eax implies +// implicit zeroing of the upper 32 bits. So alias to the longer encoding. +def : InstAlias<"xchg{l}\t{%eax, %eax|eax, eax}", + (XCHG32rr EAX, EAX), 0>, Requires<[In64BitMode]>; + +// xchg %rax, %rax is a nop in x86-64 and can be encoded as such. Without this +// we emit an unneeded REX.w prefix. +def : InstAlias<"xchg{q}\t{%rax, %rax|rax, rax}", (NOOP), 0>; + +// These aliases exist to get the parser to prioritize matching 8-bit +// immediate encodings over matching the implicit ax/eax/rax encodings. By +// explicitly mentioning the A register here, these entries will be ordered +// first due to the more explicit immediate type. 
+def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>; + +def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>; + +def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrMMX.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrMMX.td new file mode 100644 index 000000000..aefeffedf --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrMMX.td @@ -0,0 +1,612 @@ +//===-- X86InstrMMX.td - Describe the MMX Instruction Set --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 MMX instruction set, defining the instructions, +// and properties of the instructions which are needed for code generation, +// machine code emission, and analysis. +// +// All instructions that use MMX should be in this file, even if they also use +// SSE. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MMX Multiclasses +//===----------------------------------------------------------------------===// + +// Alias instruction that maps zero vector to pxor mmx. +// This is expanded by ExpandPostRAPseudos to an pxor. +// We set canFoldAsLoad because this can be converted to a constant-pool +// load of an all-zeros value if folding it would be beneficial. 
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero] in { +def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>; +} + +let Constraints = "$src1 = $dst" in { + // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic. + // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp. + multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, + X86FoldableSchedWrite sched, bit Commutable = 0, + X86MemOperand OType = i64mem> { + def irr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>, + Sched<[sched]> { + let isCommutable = Commutable; + } + def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, OType:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>, + Sched<[sched.Folded, ReadAfterLd]>; + } + + multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, Intrinsic IntId, + Intrinsic IntId2, X86FoldableSchedWrite sched, + X86FoldableSchedWrite schedImm> { + def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>, + Sched<[sched]>; + def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>, + Sched<[sched.Folded, ReadAfterLd]>; + def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst), + (ins VR64:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId2 VR64:$src1, imm:$src2))]>, + Sched<[schedImm]>; + } +} + +/// Unary MMX instructions requiring SSSE3. +multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, + Intrinsic IntId64, X86FoldableSchedWrite sched> { + def rr : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR64:$dst, (IntId64 VR64:$src))]>, + Sched<[sched]>; + + def rm : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR64:$dst, + (IntId64 (bitconvert (load_mmx addr:$src))))]>, + Sched<[sched.Folded]>; +} + +/// Binary MMX instructions requiring SSSE3. +let ImmT = NoImm, Constraints = "$src1 = $dst" in { +multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, + Intrinsic IntId64, X86FoldableSchedWrite sched, + bit Commutable = 0> { + let isCommutable = Commutable in + def rr : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]>, + Sched<[sched]>; + def rm : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + [(set VR64:$dst, + (IntId64 VR64:$src1, + (bitconvert (load_mmx addr:$src2))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} +} + +/// PALIGN MMX instructions (require SSSE3). 
+multiclass ssse3_palign_mm<string asm, Intrinsic IntId, + X86FoldableSchedWrite sched> { + def rri : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst), + (ins VR64:$src1, VR64:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>, + Sched<[sched]>; + def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst), + (ins VR64:$src1, i64mem:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR64:$dst, (IntId VR64:$src1, + (bitconvert (load_mmx addr:$src2)), (i8 imm:$src3)))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, + string asm, X86FoldableSchedWrite sched, Domain d> { + def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (Int SrcRC:$src))], d>, + Sched<[sched]>; + def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (Int (ld_frag addr:$src)))], d>, + Sched<[sched.Folded]>; +} + +multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, + PatFrag ld_frag, string asm, Domain d> { + def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), + (ins DstRC:$src1, SrcRC:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], d>, + Sched<[WriteCvtI2PS]>; + def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), asm, + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], d>, + Sched<[WriteCvtI2PS.Folded]>; +} + +//===----------------------------------------------------------------------===// +// MMX EMMS Instruction +//===----------------------------------------------------------------------===// + +let SchedRW = [WriteEMMS] in +def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>; + +//===----------------------------------------------------------------------===// +// MMX Scalar Instructions +//===----------------------------------------------------------------------===// + +// Data Transfer Instructions +def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (scalar_to_vector GR32:$src)))]>, + Sched<[WriteVecMoveFromGpr]>; +def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>, + Sched<[WriteVecLoad]>; + +let Predicates = [HasMMX] in { + def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)), + (MMX_MOVD64rr GR32:$src)>; + def : Pat<(x86mmx (MMX_X86movw2d (i32 0))), + (MMX_SET0)>; + def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))), + (MMX_MOVD64rm addr:$src)>; +} + +let mayStore = 1 in +def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src), + "movd\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteVecStore]>; + +def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, + (MMX_X86movd2w (x86mmx VR64:$src)))]>, + Sched<[WriteVecMoveToGpr]>, FoldGenData<"MMX_MOVD64rr">; + +let isBitcast = 1 in +def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, (bitconvert GR64:$src))]>, + Sched<[WriteVecMoveFromGpr]>; + +let isCodeGenOnly = 1, 
ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst), + (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", + []>, Sched<[SchedWriteVecMoveLS.MMX.RM]>; + +let isBitcast = 1 in { +def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, + (outs GR64:$dst), (ins VR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert VR64:$src))]>, + Sched<[WriteVecMoveToGpr]>; +let SchedRW = [WriteVecMove], hasSideEffects = 0, isMoveReg = 1 in { +def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), + "movq\t{$src, $dst|$dst, $src}", []>; +let isCodeGenOnly = 1, ForceDisassemble = 1 in +def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src), + "movq\t{$src, $dst|$dst, $src}", []>, + FoldGenData<"MMX_MOVQ64rr">; +} // SchedRW, hasSideEffects, isMoveReg +} // isBitcast + +def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}", + (MMX_MOVQ64rr_REV VR64:$dst, VR64:$src), 0>; + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def MMX_MOVD64from64rm : MMXRI<0x7E, MRMDestMem, + (outs), (ins i64mem:$dst, VR64:$src), + "movq\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLS.MMX.MR]>; + +let SchedRW = [SchedWriteVecMoveLS.MMX.RM] in { +let canFoldAsLoad = 1 in +def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, (load_mmx addr:$src))]>; +} // SchedRW + +let SchedRW = [SchedWriteVecMoveLS.MMX.MR] in +def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (x86mmx VR64:$src), addr:$dst)]>; + +let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { +def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), + (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", + [(set VR64:$dst, + (x86mmx (bitconvert + (i64 (extractelt (v2i64 VR128:$src), + (iPTR 0))))))]>; + +def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 + (scalar_to_vector + (i64 (bitconvert (x86mmx VR64:$src))))))]>; + +let isCodeGenOnly = 1, hasSideEffects = 1 in { +def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst), + (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", + []>; + +def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), + (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", + []>; +} +} // SchedRW + +let Predicates = [HasMMX, HasSSE1] in +def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), + "movntq\t{$src, $dst|$dst, $src}", + [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>, + Sched<[SchedWriteVecMoveLSNT.MMX.MR]>; + +let Predicates = [HasMMX] in { + // movd to MMX register zero-extends + def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))), + (MMX_MOVD64rr GR32:$src)>; + def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))), + (MMX_MOVD64rm addr:$src)>; +} + +// Arithmetic Instructions +defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b, + SchedWriteVecALU.MMX>; +defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w, + SchedWriteVecALU.MMX>; +defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d, + SchedWriteVecALU.MMX>; +// -- Addition +defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, + SchedWriteVecALU.MMX, 1>; +defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", 
int_x86_mmx_padd_w, + SchedWriteVecALU.MMX, 1>; +defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, + SchedWriteVecALU.MMX, 1>; +let Predicates = [HasMMX, HasSSE2] in +defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, + SchedWriteVecALU.MMX, 1>; +defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, + SchedWriteVecALU.MMX, 1>; +defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, + SchedWriteVecALU.MMX, 1>; + +defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, + SchedWriteVecALU.MMX, 1>; +defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, + SchedWriteVecALU.MMX, 1>; + +defm MMX_PHADDW : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w, + SchedWritePHAdd.MMX>; +defm MMX_PHADDD : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d, + SchedWritePHAdd.MMX>; +defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw, + SchedWritePHAdd.MMX>; + +// -- Subtraction +defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b, + SchedWriteVecALU.MMX>; +defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w, + SchedWriteVecALU.MMX>; +defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d, + SchedWriteVecALU.MMX>; +let Predicates = [HasMMX, HasSSE2] in +defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q, + SchedWriteVecALU.MMX>; + +defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b, + SchedWriteVecALU.MMX>; +defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w, + SchedWriteVecALU.MMX>; + +defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b, + SchedWriteVecALU.MMX>; +defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w, + SchedWriteVecALU.MMX>; + +defm MMX_PHSUBW : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w, + SchedWritePHAdd.MMX>; +defm MMX_PHSUBD : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d, + SchedWritePHAdd.MMX>; +defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw, + SchedWritePHAdd.MMX>; + +// -- Multiplication +defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, + SchedWriteVecIMul.MMX, 1>; + +defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, + SchedWriteVecIMul.MMX, 1>; +let Predicates = [HasMMX, HasSSE1] in +defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, + SchedWriteVecIMul.MMX, 1>; +let Predicates = [HasMMX, HasSSE2] in +defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, + SchedWriteVecIMul.MMX, 1>; +defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw", + int_x86_ssse3_pmul_hr_sw, + SchedWriteVecIMul.MMX, 1>; + +// -- Miscellanea +defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, + SchedWriteVecIMul.MMX, 1>; + +defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", + int_x86_ssse3_pmadd_ub_sw, + SchedWriteVecIMul.MMX>; +let Predicates = [HasMMX, HasSSE1] in { +defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, + SchedWriteVecALU.MMX, 1>; +defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, + SchedWriteVecALU.MMX, 1>; + +defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, + SchedWriteVecALU.MMX, 1>; +defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, + SchedWriteVecALU.MMX, 1>; + +defm MMX_PMAXUB : 
MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, + SchedWriteVecALU.MMX, 1>; +defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, + SchedWriteVecALU.MMX, 1>; + +defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, + SchedWritePSADBW.MMX, 1>; +} + +defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b, + SchedWriteVecALU.MMX>; +defm MMX_PSIGNW : SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w, + SchedWriteVecALU.MMX>; +defm MMX_PSIGND : SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d, + SchedWriteVecALU.MMX>; +let Constraints = "$src1 = $dst" in + defm MMX_PALIGNR : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b, + SchedWriteShuffle.MMX>; + +// Logical Instructions +defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, + SchedWriteVecLogic.MMX, 1>; +defm MMX_POR : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por, + SchedWriteVecLogic.MMX, 1>; +defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, + SchedWriteVecLogic.MMX, 1>; +defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, + SchedWriteVecLogic.MMX>; + +// Shift Instructions +defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", + int_x86_mmx_psrl_w, int_x86_mmx_psrli_w, + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; +defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", + int_x86_mmx_psrl_d, int_x86_mmx_psrli_d, + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; +defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", + int_x86_mmx_psrl_q, int_x86_mmx_psrli_q, + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; + +defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", + int_x86_mmx_psll_w, int_x86_mmx_pslli_w, + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; +defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", + int_x86_mmx_psll_d, int_x86_mmx_pslli_d, + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; +defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", + int_x86_mmx_psll_q, int_x86_mmx_pslli_q, + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; + +defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", + int_x86_mmx_psra_w, int_x86_mmx_psrai_w, + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; +defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", + int_x86_mmx_psra_d, int_x86_mmx_psrai_d, + SchedWriteVecShift.MMX, + SchedWriteVecShiftImm.MMX>; + +// Comparison Instructions +defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b, + SchedWriteVecALU.MMX>; +defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w, + SchedWriteVecALU.MMX>; +defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d, + SchedWriteVecALU.MMX>; + +defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b, + SchedWriteVecALU.MMX>; +defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w, + SchedWriteVecALU.MMX>; +defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d, + SchedWriteVecALU.MMX>; + +// -- Unpack Instructions +defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", + int_x86_mmx_punpckhbw, + SchedWriteShuffle.MMX>; +defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", + int_x86_mmx_punpckhwd, + SchedWriteShuffle.MMX>; +defm MMX_PUNPCKHDQ : MMXI_binop_rm_int<0x6A, "punpckhdq", + int_x86_mmx_punpckhdq, + SchedWriteShuffle.MMX>; +defm MMX_PUNPCKLBW : MMXI_binop_rm_int<0x60, 
"punpcklbw", + int_x86_mmx_punpcklbw, + SchedWriteShuffle.MMX, + 0, i32mem>; +defm MMX_PUNPCKLWD : MMXI_binop_rm_int<0x61, "punpcklwd", + int_x86_mmx_punpcklwd, + SchedWriteShuffle.MMX, + 0, i32mem>; +defm MMX_PUNPCKLDQ : MMXI_binop_rm_int<0x62, "punpckldq", + int_x86_mmx_punpckldq, + SchedWriteShuffle.MMX, + 0, i32mem>; + +// -- Pack Instructions +defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb, + SchedWriteShuffle.MMX>; +defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw, + SchedWriteShuffle.MMX>; +defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb, + SchedWriteShuffle.MMX>; + +// -- Shuffle Instructions +defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b, + SchedWriteVarShuffle.MMX>; + +def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg, + (outs VR64:$dst), (ins VR64:$src1, u8imm:$src2), + "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR64:$dst, + (int_x86_sse_pshuf_w VR64:$src1, imm:$src2))]>, + Sched<[SchedWriteShuffle.MMX]>; +def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, + (outs VR64:$dst), (ins i64mem:$src1, u8imm:$src2), + "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR64:$dst, + (int_x86_sse_pshuf_w (load_mmx addr:$src1), + imm:$src2))]>, + Sched<[SchedWriteShuffle.MMX.Folded]>; + +// -- Conversion Instructions +defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, + f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}", + WriteCvtPS2I, SSEPackedSingle>, PS; +defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi, + f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}", + WriteCvtPD2I, SSEPackedDouble>, PD; +defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi, + f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}", + WriteCvtPS2I, SSEPackedSingle>, PS; +defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi, + f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}", + WriteCvtPD2I, SSEPackedDouble>, PD; +defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd, + i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}", + WriteCvtI2PD, SSEPackedDouble>, PD; +let Constraints = "$src1 = $dst" in { + defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128, + int_x86_sse_cvtpi2ps, + i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}", + SSEPackedSingle>, PS; +} + +// Extract / Insert +let Predicates = [HasMMX, HasSSE1] in +def MMX_PEXTRWrr: MMXIi8<0xC5, MRMSrcReg, + (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2), + "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1, + imm:$src2))]>, + Sched<[WriteVecExtract]>; +let Constraints = "$src1 = $dst" in { +let Predicates = [HasMMX, HasSSE1] in { + def MMX_PINSRWrr : MMXIi8<0xC4, MRMSrcReg, + (outs VR64:$dst), + (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3), + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, + GR32orGR64:$src2, imm:$src3))]>, + Sched<[WriteVecInsert]>; + + def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem, + (outs VR64:$dst), + (ins VR64:$src1, i16mem:$src2, i32u8imm:$src3), + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1, + (i32 (anyext (loadi16 addr:$src2))), + imm:$src3))]>, + Sched<[WriteVecInsertLd, ReadAfterLd]>; +} +} + +// Mask creation +let Predicates = [HasMMX, HasSSE1] in +def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs 
GR32orGR64:$dst), + (ins VR64:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32orGR64:$dst, + (int_x86_mmx_pmovmskb VR64:$src))]>, + Sched<[WriteMMXMOVMSK]>; + +// Low word of XMM to MMX. +def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, + [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>; + +def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)), + (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>; + +def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), + (x86mmx (MMX_MOVQ64rm addr:$src))>; + +// Misc. +let SchedRW = [SchedWriteShuffle.MMX] in { +let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in +def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), + "maskmovq\t{$mask, $src|$src, $mask}", + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>; +let Uses = [RDI], Predicates = [HasMMX, HasSSE1,In64BitMode] in +def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), + "maskmovq\t{$mask, $src|$src, $mask}", + [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)]>; +} + +// 64-bit bit convert. +let Predicates = [HasMMX, HasSSE2] in { +def : Pat<(f64 (bitconvert (x86mmx VR64:$src))), + (MMX_MOVQ2FR64rr VR64:$src)>; +def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), + (MMX_MOVFR642Qrr FR64:$src)>; +def : Pat<(x86mmx (MMX_X86movdq2q + (bc_v2i64 (v4i32 (X86cvtp2Int (v4f32 VR128:$src)))))), + (MMX_CVTPS2PIirr VR128:$src)>; +def : Pat<(x86mmx (MMX_X86movdq2q + (bc_v2i64 (v4i32 (X86cvttp2si (v4f32 VR128:$src)))))), + (MMX_CVTTPS2PIirr VR128:$src)>; +def : Pat<(x86mmx (MMX_X86movdq2q + (bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))), + (MMX_CVTTPS2PIirr VR128:$src)>; +def : Pat<(x86mmx (MMX_X86movdq2q + (bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), + (MMX_CVTPD2PIirr VR128:$src)>; +def : Pat<(x86mmx (MMX_X86movdq2q + (bc_v2i64 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), + (MMX_CVTTPD2PIirr VR128:$src)>; +} diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrMPX.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrMPX.td new file mode 100644 index 000000000..c1a8cc7c5 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrMPX.td @@ -0,0 +1,80 @@ +//===-- X86InstrMPX.td - MPX Instruction Set ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 MPX instruction set, defining the +// instructions, and properties of the instructions which are needed for code +// generation, machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +// FIXME: Investigate a better scheduler class once MPX is used inside LLVM. 
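The MPX definitions below, like most of the instruction definitions in these files, are stamped out by TableGen multiclasses: a defm statement instantiates every def inside the referenced multiclass and prefixes each resulting record name with the defm name, so defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; produces the records BNDMK32rm and BNDMK64rm. A minimal standalone sketch of just that expansion mechanism (SketchInst and the record names are invented for illustration, not the real X86 instruction classes) that can be run through llvm-tblgen looks roughly like this:

// sketch.td - illustrates only the defm/multiclass naming pattern used by
// mpx_bound_make below; the class and record names here are hypothetical.
class SketchInst<string asm> {
  string AsmString = asm;   // the assembly string carried by the record
}

multiclass sketch_bound_make<string OpcodeStr> {
  // Inside a multiclass the def names act as suffixes; the final record
  // names are the defm name concatenated with "32rm" / "64rm".
  def 32rm : SketchInst<OpcodeStr#" (32-bit mode form)">;
  def 64rm : SketchInst<OpcodeStr#" (64-bit mode form)">;
}

// Expands to the records SKETCH_BNDMK32rm and SKETCH_BNDMK64rm;
// `llvm-tblgen sketch.td` prints both expanded records.
defm SKETCH_BNDMK : sketch_bound_make<"bndmk">;

In the real file, the class I additionally carries the opcode, ModRM form, and operand lists, and each variant is gated with Requires<[HasMPX, Not64BitMode]> or Requires<[HasMPX, In64BitMode]> so that only the mode-appropriate form is available.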
+let SchedRW = [WriteSystem] in { + +multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> { + def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src), + OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, + Requires<[HasMPX, Not64BitMode]>; + def 64rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src), + OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, + Requires<[HasMPX, In64BitMode]>; +} + +defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; + +multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> { + def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2), + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, + Requires<[HasMPX, Not64BitMode]>; + def 64rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, anymem:$src2), + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, + Requires<[HasMPX, In64BitMode]>; + + def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2), + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, + Requires<[HasMPX, Not64BitMode]>; + def 64rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2), + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, + Requires<[HasMPX, In64BitMode]>; +} +defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS, NotMemoryFoldable; +defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD, NotMemoryFoldable; +defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD, NotMemoryFoldable; + +def BNDMOVrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src), + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, + Requires<[HasMPX]>, NotMemoryFoldable; +let mayLoad = 1 in { +def BNDMOV32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, + Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable; +def BNDMOV64rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src), + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, + Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable; +} +let isCodeGenOnly = 1, ForceDisassemble = 1 in +def BNDMOVrr_REV : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src), + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, + Requires<[HasMPX]>, NotMemoryFoldable; +let mayStore = 1 in { +def BNDMOV32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, + Requires<[HasMPX, Not64BitMode]>, NotMemoryFoldable; +def BNDMOV64mr : I<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src), + "bndmov\t{$src, $dst|$dst, $src}", []>, PD, + Requires<[HasMPX, In64BitMode]>, NotMemoryFoldable; + +def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins anymem:$dst, BNDR:$src), + "bndstx\t{$src, $dst|$dst, $src}", []>, PS, + Requires<[HasMPX]>; +} +let mayLoad = 1 in +def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src), + "bndldx\t{$src, $dst|$dst, $src}", []>, PS, + Requires<[HasMPX]>; +} // SchedRW diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrSGX.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrSGX.td new file mode 100644 index 000000000..488cc4438 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrSGX.td @@ -0,0 +1,30 @@ +//===-- X86InstrSGX.td - SGX Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel SGX instruction +// set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SGX instructions + +let SchedRW = [WriteSystem], Predicates = [HasSGX] in { +// ENCLS - Execute an Enclave System Function of Specified Leaf Number +def ENCLS : I<0x01, MRM_CF, (outs), (ins), + "encls", []>, TB; + +// ENCLU - Execute an Enclave User Function of Specified Leaf Number +def ENCLU : I<0x01, MRM_D7, (outs), (ins), + "enclu", []>, TB; + +// ENCLV - Execute an Enclave VMM Function of Specified Leaf Number +def ENCLV : I<0x01, MRM_C0, (outs), (ins), + "enclv", []>, TB; +} // SchedRW diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrSSE.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrSSE.td new file mode 100644 index 000000000..910b80636 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrSSE.td @@ -0,0 +1,8256 @@ +//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 SSE instruction set, defining the instructions, +// and properties of the instructions which are needed for code generation, +// machine code emission, and analysis. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 Instructions Classes +//===----------------------------------------------------------------------===// + +/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class +multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, X86MemOperand x86memop, + Domain d, X86FoldableSchedWrite sched, + bit Is2Addr = 1> { + let isCommutable = 1 in { + def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], d>, + Sched<[sched]>; + } + def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], d>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class +multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode, RegisterClass RC, + ValueType VT, string asm, Operand memopr, + ComplexPattern mem_cpat, Domain d, + X86FoldableSchedWrite sched, bit Is2Addr = 1> { +let isCodeGenOnly = 1, hasSideEffects = 0 in { + def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], d>, + Sched<[sched]>; + let mayLoad = 1 in + def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (VT 
(OpNode RC:$src1, mem_cpat:$src2)))], d>, + Sched<[sched.Folded, ReadAfterLd]>; +} +} + +/// sse12_fp_packed - SSE 1 & 2 packed instructions class +multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + RegisterClass RC, ValueType vt, + X86MemOperand x86memop, PatFrag mem_frag, + Domain d, X86FoldableSchedWrite sched, + bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>, + Sched<[sched]>; + let mayLoad = 1 in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], + d>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class +multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, + string OpcodeStr, X86MemOperand x86memop, + X86FoldableSchedWrite sched, + list<dag> pat_rr, list<dag> pat_rm, + bit Is2Addr = 1> { + let isCommutable = 1, hasSideEffects = 0 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + pat_rr, d>, + Sched<[sched]>; + let hasSideEffects = 0, mayLoad = 1 in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + pat_rm, d>, + Sched<[sched.Folded, ReadAfterLd]>; +} + + +/* +// Alias instructions that map fld0 to xorps for sse or vxorps for avx. +// This is expanded by ExpandPostRAPseudos. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero] in { + def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "", + [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>; + def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "", + [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>; +} +*/ + +//===----------------------------------------------------------------------===// +// AVX & SSE - Zero/One Vectors +//===----------------------------------------------------------------------===// + +// Alias instruction that maps zero vector to pxor / xorp* for sse. +// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then +// swizzled by ExecutionDomainFix to pxor. +// We set canFoldAsLoad because this can be converted to a constant-pool +// load of an all-zeros value if folding it would be beneficial. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero] in { +def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4f32 immAllZerosV))]>; +} + +let Predicates = [NoAVX512] in +def : Pat<(v4i32 immAllZerosV), (V_SET0)>; + + +// The same as done above but for AVX. 
The 256-bit AVX1 ISA doesn't support PI, +// and doesn't need it because on sandy bridge the register is set to zero +// at the rename stage without using any execution unit, so SET0PSY +// and SET0PDY can be used for vector int instructions without penalty +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in { +def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllZerosV))]>; +} + +// We set canFoldAsLoad because this can be converted to a constant-pool +// load of an all-ones value if folding it would be beneficial. +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero] in { + def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "", + [(set VR128:$dst, (v4i32 immAllOnesV))]>; + let Predicates = [HasAVX1Only, OptForMinSize] in { + def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>; + } + let Predicates = [HasAVX2] in + def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move FP Scalar Instructions +// +// Move Instructions. Register-to-register movss/movsd is not used for FR32/64 +// register copies because it's a partial register update; Register-to-register +// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires +// that the insert be implementable in terms of a copy, and just mentioned, we +// don't use movss/movsd for copies. +//===----------------------------------------------------------------------===// + +multiclass sse12_move_rr<SDNode OpNode, ValueType vt, + X86MemOperand x86memop, string base_opc, + string asm_opr, Domain d, string Name> { + let isCommutable = 1 in + def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(base_opc, asm_opr), + [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], d>, + Sched<[SchedWriteFShuffle.XMM]>; + + // For the disassembler + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(base_opc, asm_opr), []>, + Sched<[SchedWriteFShuffle.XMM]>, FoldGenData<Name#rr>; +} + +multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, + X86MemOperand x86memop, string OpcodeStr, + Domain d, string Name, Predicate pred> { + // AVX + let Predicates = [UseAVX, OptForSize] in + defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d, + "V"#Name>, + VEX_4V, VEX_LIG, VEX_WIG; + + def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store RC:$src, addr:$dst)], d>, + VEX, VEX_LIG, Sched<[WriteFStore]>, VEX_WIG; + // SSE1 & 2 + let Constraints = "$src1 = $dst" in { + let Predicates = [pred, NoSSE41_Or_OptForSize] in + defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr, + "\t{$src2, $dst|$dst, $src2}", d, Name>; + } + + def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(store RC:$src, addr:$dst)], d>, + Sched<[WriteFStore]>; + + def : InstAlias<"v"#OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (!cast<Instruction>("V"#NAME#"rr_REV") + VR128:$dst, 
VR128:$src1, VR128:$src2), 0>; + def : InstAlias<OpcodeStr#".s\t{$src2, $dst|$dst, $src2}", + (!cast<Instruction>(NAME#"rr_REV") + VR128:$dst, VR128:$src2), 0>; +} + +// Loading from memory automatically zeroing upper bits. +multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_pat, string OpcodeStr, Domain d> { + def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], d>, + VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG; + def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (mem_pat addr:$src))], d>, + Sched<[WriteFLoad]>; +} + +defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss", + SSEPackedSingle, "MOVSS", UseSSE1>, XS; +defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd", + SSEPackedDouble, "MOVSD", UseSSE2>, XD; + +let canFoldAsLoad = 1, isReMaterializable = 1 in { + defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss", + SSEPackedSingle>, XS; + defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd", + SSEPackedDouble>, XD; +} + +// Patterns +let Predicates = [UseAVX] in { + // MOVSSrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>; + + // MOVSDrm zeros the high parts of the register; represent this + // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0 + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>; + + // Represent the same patterns above but in the form they appear for + // 256-bit types + def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, + (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; + def : Pat<(v8f32 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>; + def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, + (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; + def : Pat<(v4f64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>; + + // Extract and store. + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>; +} + +let Predicates = [UseAVX, OptForSize] in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSS to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (VMOVSSrr (v4f32 (V_SET0)), VR128:$src)>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (VMOVSSrr (v4i32 (V_SET0)), VR128:$src)>; + + // Move low f32 and clear high bits. 
+ def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VMOVSSrr (v4f32 (V_SET0)), + (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))), sub_xmm)>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (v4i32 (VMOVSSrr (v4i32 (V_SET0)), + (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>; + + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (v2f64 (VMOVSDrr (v2f64 (V_SET0)), + (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))), + sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (v2i64 (VMOVSDrr (v2i64 (V_SET0)), + (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))), + sub_xmm)>; +} + +let Predicates = [UseSSE1] in { + let Predicates = [UseSSE1, NoSSE41_Or_OptForSize] in { + // Move scalar to XMM zero-extended, zeroing a VR128 then do a + // MOVSS to the lower bits. + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>; + } + + // MOVSSrm already zeros the high parts of the register. + def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + def : Pat<(v4f32 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>; + + // Extract and store. + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), + addr:$dst), + (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>; +} + +let Predicates = [UseSSE2] in { + // MOVSDrm already zeros the high parts of the register. + def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; + def : Pat<(v2f64 (X86vzload addr:$src)), + (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>; +} + +// Aliases to help the assembler pick two byte VEX encodings by swapping the +// operands relative to the normal instructions to use VEX.R instead of VEX.B. 
+def : InstAlias<"vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>; +def : InstAlias<"vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions +//===----------------------------------------------------------------------===// + +multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC, + X86MemOperand x86memop, PatFrag ld_frag, + string asm, Domain d, + X86SchedWriteMoveLS sched> { +let hasSideEffects = 0, isMoveReg = 1 in + def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>, + Sched<[sched.RR]>; +let canFoldAsLoad = 1, isReMaterializable = 1 in + def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (ld_frag addr:$src))], d>, + Sched<[sched.RM]>; +} + +let Predicates = [HasAVX, NoVLX] in { +defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", + SSEPackedSingle, SchedWriteFMoveLS.XMM>, + PS, VEX, VEX_WIG; +defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd", + SSEPackedDouble, SchedWriteFMoveLS.XMM>, + PD, VEX, VEX_WIG; +defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", + SSEPackedSingle, SchedWriteFMoveLS.XMM>, + PS, VEX, VEX_WIG; +defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", + SSEPackedDouble, SchedWriteFMoveLS.XMM>, + PD, VEX, VEX_WIG; + +defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps", + SSEPackedSingle, SchedWriteFMoveLS.YMM>, + PS, VEX, VEX_L, VEX_WIG; +defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd", + SSEPackedDouble, SchedWriteFMoveLS.YMM>, + PD, VEX, VEX_L, VEX_WIG; +defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", + SSEPackedSingle, SchedWriteFMoveLS.YMM>, + PS, VEX, VEX_L, VEX_WIG; +defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", + SSEPackedDouble, SchedWriteFMoveLS.YMM>, + PD, VEX, VEX_L, VEX_WIG; +} + +let Predicates = [UseSSE1] in { +defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", + SSEPackedSingle, SchedWriteFMoveLS.XMM>, + PS; +defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", + SSEPackedSingle, SchedWriteFMoveLS.XMM>, + PS; +} +let Predicates = [UseSSE2] in { +defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd", + SSEPackedDouble, SchedWriteFMoveLS.XMM>, + PD; +defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", + SSEPackedDouble, SchedWriteFMoveLS.XMM>, + PD; +} + +let Predicates = [HasAVX, NoVLX] in { +let SchedRW = [SchedWriteFMoveLS.XMM.MR] in { +def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore (v4f32 VR128:$src), addr:$dst)]>, + VEX, VEX_WIG; +def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v2f64 VR128:$src), addr:$dst)]>, + VEX, VEX_WIG; +def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)]>, + VEX, VEX_WIG; +def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, 
VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)]>, + VEX, VEX_WIG; +} // SchedRW + +let SchedRW = [SchedWriteFMoveLS.YMM.MR] in { +def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore (v8f32 VR256:$src), addr:$dst)]>, + VEX, VEX_L, VEX_WIG; +def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v4f64 VR256:$src), addr:$dst)]>, + VEX, VEX_L, VEX_WIG; +def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v8f32 VR256:$src), addr:$dst)]>, + VEX, VEX_L, VEX_WIG; +def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v4f64 VR256:$src), addr:$dst)]>, + VEX, VEX_L, VEX_WIG; +} // SchedRW +} // Predicate + +// For disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + isMoveReg = 1 in { +let SchedRW = [SchedWriteFMoveLS.XMM.RR] in { + def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", []>, + VEX, VEX_WIG, FoldGenData<"VMOVAPSrr">; + def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", []>, + VEX, VEX_WIG, FoldGenData<"VMOVAPDrr">; + def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movups\t{$src, $dst|$dst, $src}", []>, + VEX, VEX_WIG, FoldGenData<"VMOVUPSrr">; + def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), + (ins VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", []>, + VEX, VEX_WIG, FoldGenData<"VMOVUPDrr">; +} // SchedRW + +let SchedRW = [SchedWriteFMoveLS.YMM.RR] in { + def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movaps\t{$src, $dst|$dst, $src}", []>, + VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPSYrr">; + def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movapd\t{$src, $dst|$dst, $src}", []>, + VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVAPDYrr">; + def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movups\t{$src, $dst|$dst, $src}", []>, + VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPSYrr">; + def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), + (ins VR256:$src), + "movupd\t{$src, $dst|$dst, $src}", []>, + VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVUPDYrr">; +} // SchedRW +} // Predicate + +// Aliases to help the assembler pick two byte VEX encodings by swapping the +// operands relative to the normal instructions to use VEX.R instead of VEX.B. 
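As a concrete illustration of the byte saving (the specific registers are chosen only for the example): take vmovaps %xmm9, %xmm2. The normal VMOVAPSrr form (opcode 0x28, MRMSrcReg) encodes $dst in ModRM.reg and $src in ModRM.r/m, so xmm9 in the r/m slot needs the VEX.B bit, which only the 3-byte C4 prefix carries: 3 (VEX) + 1 (opcode) + 1 (ModRM) = 5 bytes. The reversed VMOVAPSrr_REV form (opcode 0x29, MRMDestReg) puts $src in ModRM.reg instead, where xmm9 only needs VEX.R, which the 2-byte C5 prefix can express: 2 + 1 + 1 = 4 bytes. The VR128L/VR128H (and VR256L/VR256H) operand classes in the aliases below restrict them to exactly the register combinations where this swap saves the byte.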
+def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}", + (VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>; +def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}", + (VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>; +def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}", + (VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>; +def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}", + (VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>; +def : InstAlias<"vmovaps\t{$src, $dst|$dst, $src}", + (VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>; +def : InstAlias<"vmovapd\t{$src, $dst|$dst, $src}", + (VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>; +def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}", + (VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>; +def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}", + (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>; + +// Reversed version with ".s" suffix for GAS compatibility. +def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}", + (VMOVAPSrr_REV VR128:$dst, VR128:$src), 0>; +def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}", + (VMOVAPDrr_REV VR128:$dst, VR128:$src), 0>; +def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}", + (VMOVUPSrr_REV VR128:$dst, VR128:$src), 0>; +def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}", + (VMOVUPDrr_REV VR128:$dst, VR128:$src), 0>; +def : InstAlias<"vmovaps.s\t{$src, $dst|$dst, $src}", + (VMOVAPSYrr_REV VR256:$dst, VR256:$src), 0>; +def : InstAlias<"vmovapd.s\t{$src, $dst|$dst, $src}", + (VMOVAPDYrr_REV VR256:$dst, VR256:$src), 0>; +def : InstAlias<"vmovups.s\t{$src, $dst|$dst, $src}", + (VMOVUPSYrr_REV VR256:$dst, VR256:$src), 0>; +def : InstAlias<"vmovupd.s\t{$src, $dst|$dst, $src}", + (VMOVUPDYrr_REV VR256:$dst, VR256:$src), 0>; + +let SchedRW = [SchedWriteFMoveLS.XMM.MR] in { +def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", + [(alignedstore (v4f32 VR128:$src), addr:$dst)]>; +def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", + [(alignedstore (v2f64 VR128:$src), addr:$dst)]>; +def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movups\t{$src, $dst|$dst, $src}", + [(store (v4f32 VR128:$src), addr:$dst)]>; +def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", + [(store (v2f64 VR128:$src), addr:$dst)]>; +} // SchedRW + +// For disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + isMoveReg = 1, SchedRW = [SchedWriteFMoveLS.XMM.RR] in { + def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movaps\t{$src, $dst|$dst, $src}", []>, + FoldGenData<"MOVAPSrr">; + def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movapd\t{$src, $dst|$dst, $src}", []>, + FoldGenData<"MOVAPDrr">; + def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movups\t{$src, $dst|$dst, $src}", []>, + FoldGenData<"MOVUPSrr">; + def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movupd\t{$src, $dst|$dst, $src}", []>, + FoldGenData<"MOVUPDrr">; +} + +// Reversed version with ".s" suffix for GAS compatibility. 
+def : InstAlias<"movaps.s\t{$src, $dst|$dst, $src}", + (MOVAPSrr_REV VR128:$dst, VR128:$src), 0>; +def : InstAlias<"movapd.s\t{$src, $dst|$dst, $src}", + (MOVAPDrr_REV VR128:$dst, VR128:$src), 0>; +def : InstAlias<"movups.s\t{$src, $dst|$dst, $src}", + (MOVUPSrr_REV VR128:$dst, VR128:$src), 0>; +def : InstAlias<"movupd.s\t{$src, $dst|$dst, $src}", + (MOVUPDrr_REV VR128:$dst, VR128:$src), 0>; + +let Predicates = [HasAVX, NoVLX] in { + // 256-bit load/store need to use floating point load/store in case we don't + // have AVX2. Execution domain fixing will convert to integer if AVX2 is + // available and changing the domain is beneficial. + def : Pat<(alignedloadv4i64 addr:$src), + (VMOVAPSYrm addr:$src)>; + def : Pat<(loadv4i64 addr:$src), + (VMOVUPSYrm addr:$src)>; + def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst), + (VMOVAPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v4i64 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v8i32 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v16i16 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; + def : Pat<(store (v32i8 VR256:$src), addr:$dst), + (VMOVUPSYmr addr:$dst, VR256:$src)>; +} + +// Use movaps / movups for SSE integer load / store (one byte shorter). +// The instructions selected below are then converted to MOVDQA/MOVDQU +// during the SSE domain pass. +let Predicates = [UseSSE1] in { + def : Pat<(alignedloadv2i64 addr:$src), + (MOVAPSrm addr:$src)>; + def : Pat<(loadv2i64 addr:$src), + (MOVUPSrm addr:$src)>; + + def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v2i64 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, VR128:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Low packed FP Instructions +//===----------------------------------------------------------------------===// + +multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode pdnode, + string base_opc, string asm_opr> { + // No pattern as they need be special cased between high and low. 
+ let hasSideEffects = 0, mayLoad = 1 in + def PSrm : PI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), + !strconcat(base_opc, "s", asm_opr), + [], SSEPackedSingle>, PS, + Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>; + + def PDrm : PI<opc, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), + !strconcat(base_opc, "d", asm_opr), + [(set VR128:$dst, (v2f64 (pdnode VR128:$src1, + (scalar_to_vector (loadf64 addr:$src2)))))], + SSEPackedDouble>, PD, + Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>; +} + +multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode, + string base_opc> { + let Predicates = [UseAVX] in + defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, + VEX_4V, VEX_WIG; + + let Constraints = "$src1 = $dst" in + defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc, + "\t{$src2, $dst|$dst, $src2}">; +} + +defm MOVL : sse12_mov_hilo_packed<0x12, X86Movsd, "movlp">; + +let SchedRW = [WriteFStore] in { +let Predicates = [UseAVX] in { +def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)]>, + VEX, VEX_WIG; +def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)]>, + VEX, VEX_WIG; +}// UseAVX +def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), + (iPTR 0))), addr:$dst)]>; +def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt (v2f64 VR128:$src), + (iPTR 0))), addr:$dst)]>; +} // SchedRW + +let Predicates = [UseSSE1] in { + // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS + def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)), + (iPTR 0))), addr:$src1), + (MOVLPSmr addr:$src1, VR128:$src2)>; + + // This pattern helps select MOVLPS on SSE1 only targets. With SSE2 we'll + // end up with a movsd or blend instead of shufp. + // No need for aligned load, we're only loading 64-bits. + def : Pat<(X86Shufp (loadv4f32 addr:$src2), VR128:$src1, (i8 -28)), + (MOVLPSrm VR128:$src1, addr:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Hi packed FP Instructions +//===----------------------------------------------------------------------===// + +defm MOVH : sse12_mov_hilo_packed<0x16, X86Unpckl, "movhp">; + +let SchedRW = [WriteFStore] in { +// v2f64 extract element 1 is always custom lowered to unpack high to low +// and extract element 0 so the non-store version isn't too horrible. 
+let Predicates = [UseAVX] in { +def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), + (bc_v2f64 (v4f32 VR128:$src))), + (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG; +def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (v2f64 (X86Unpckh VR128:$src, VR128:$src)), + (iPTR 0))), addr:$dst)]>, VEX, VEX_WIG; +} // UseAVX +def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhps\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), + (bc_v2f64 (v4f32 VR128:$src))), + (iPTR 0))), addr:$dst)]>; +def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (extractelt + (v2f64 (X86Unpckh VR128:$src, VR128:$src)), + (iPTR 0))), addr:$dst)]>; +} // SchedRW + +let Predicates = [UseAVX] in { + // Also handle an i64 load because that may get selected as a faster way to + // load the data. + def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDrm VR128:$src1, addr:$src2)>; + + def : Pat<(store (f64 (extractelt + (v2f64 (X86VPermilpi VR128:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (VMOVHPDmr addr:$dst, VR128:$src)>; +} + +let Predicates = [UseSSE1] in { + // This pattern helps select MOVHPS on SSE1 only targets. With SSE2 we'll + // end up with a movsd or blend instead of shufp. + // No need for aligned load, we're only loading 64-bits. + def : Pat<(X86Movlhps VR128:$src1, (loadv4f32 addr:$src2)), + (MOVHPSrm VR128:$src1, addr:$src2)>; +} + +let Predicates = [UseSSE2] in { + // MOVHPD patterns + + // Also handle an i64 load because that may get selected as a faster way to + // load the data. 
+ def : Pat<(v2f64 (X86Unpckl VR128:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (MOVHPDrm VR128:$src1, addr:$src2)>; + + def : Pat<(store (f64 (extractelt + (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), + (iPTR 0))), addr:$dst), + (MOVHPDmr addr:$dst, VR128:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [UseAVX] in { + def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>, + VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG; + let isCommutable = 1 in + def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>, + VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG, + NotMemoryFoldable; +} +let Constraints = "$src1 = $dst" in { + def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movlhps\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>, + Sched<[SchedWriteFShuffle.XMM]>; + let isCommutable = 1 in + def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + "movhlps\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>, + Sched<[SchedWriteFShuffle.XMM]>, NotMemoryFoldable; +} + +// TODO: This is largely to trick fastisel into ignoring the pattern. +def UnpckhUnary : PatFrag<(ops node:$src1, node:$src2), + (X86Unpckh node:$src1, node:$src2), [{ + return N->getOperand(0) == N->getOperand(1); +}]>; + +let Predicates = [UseSSE2] in { + // TODO: This is a hack pattern to allow lowering to emit unpckh instead of + // movhlps for sse2 without changing a bunch of tests. 
+ def : Pat<(v2f64 (UnpckhUnary VR128:$src, VR128:$src)), + (MOVHLPSrr VR128:$src, VR128:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Conversion Instructions +//===----------------------------------------------------------------------===// + +multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, + string asm, X86FoldableSchedWrite sched> { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, + [(set DstRC:$dst, (OpNode SrcRC:$src))]>, + Sched<[sched]>; + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, + [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, + Sched<[sched.Folded]>; +} + +multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop, + ValueType DstTy, ValueType SrcTy, PatFrag ld_frag, + string asm, Domain d, X86FoldableSchedWrite sched> { +let hasSideEffects = 0 in { + def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm, + [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>, + Sched<[sched]>; + let mayLoad = 1 in + def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm, + [(set RC:$dst, (DstTy (sint_to_fp + (SrcTy (bitconvert (ld_frag addr:$src))))))], d>, + Sched<[sched.Folded]>; +} +} + +multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + X86MemOperand x86memop, string asm, + X86FoldableSchedWrite sched> { +let hasSideEffects = 0, Predicates = [UseAVX] in { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + Sched<[sched]>; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, + Sched<[sched.Folded, ReadAfterLd]>; +} // hasSideEffects = 0 +} + +let Predicates = [UseAVX] in { +defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + WriteCvtSS2I>, + XS, VEX, VEX_LIG; +defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + WriteCvtSS2I>, + XS, VEX, VEX_W, VEX_LIG; +defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + WriteCvtSD2I>, + XD, VEX, VEX_LIG; +defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + WriteCvtSD2I>, + XD, VEX, VEX_W, VEX_LIG; + +def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">; +def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">; +def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">; +def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">; +def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">; +def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0, "att">; +def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">; +def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">; +} +// The assembler can 
recognize rr 64-bit instructions by seeing a rxx +// register, but the same isn't true when only using memory operands, +// provide other assembly "l" and "q" forms to address this explicitly +// where appropriate to do so. +defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}", + WriteCvtI2SS>, XS, VEX_4V, VEX_LIG; +defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}", + WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG; +defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}", + WriteCvtI2SD>, XD, VEX_4V, VEX_LIG; +defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}", + WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG; + +let Predicates = [UseAVX] in { + def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">; + def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}", + (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0, "att">; + + def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), + (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; + def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), + (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; + + def : Pat<(f32 (sint_to_fp GR32:$src)), + (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>; + def : Pat<(f32 (sint_to_fp GR64:$src)), + (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>; + def : Pat<(f64 (sint_to_fp GR32:$src)), + (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>; + def : Pat<(f64 (sint_to_fp GR64:$src)), + (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>; +} + +defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + WriteCvtSS2I>, XS; +defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}", + WriteCvtSS2I>, XS, REX_W; +defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + WriteCvtSD2I>, XD; +defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}", + WriteCvtSD2I>, XD, REX_W; +defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, + "cvtsi2ss{l}\t{$src, $dst|$dst, $src}", + WriteCvtI2SS>, XS; +defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, + "cvtsi2ss{q}\t{$src, $dst|$dst, $src}", + WriteCvtI2SS>, XS, REX_W; +defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, + "cvtsi2sd{l}\t{$src, $dst|$dst, $src}", + WriteCvtI2SD>, XD; +defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, + "cvtsi2sd{q}\t{$src, $dst|$dst, $src}", + WriteCvtI2SD>, XD, REX_W; + +def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSS2SIrr GR32:$dst, FR32:$src), 0, "att">; +def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0, "att">; +def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSD2SIrr GR32:$dst, FR64:$src), 0, "att">; +def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0, "att">; +def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0, "att">; +def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSS2SI64rm 
GR64:$dst, f32mem:$src), 0, "att">; +def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0, "att">; +def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0, "att">; + +def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}", + (CVTSI2SSrm FR64:$dst, i32mem:$src), 0, "att">; +def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", + (CVTSI2SDrm FR64:$dst, i32mem:$src), 0, "att">; + +// Conversion Instructions Intrinsics - Match intrinsics which expect MM +// and/or XMM operand(s). + +// FIXME: We probably want to match the rm form only when optimizing for +// size, to avoid false depenendecies (see sse_fp_unop_s for details) +multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, + Intrinsic Int, Operand memop, ComplexPattern mem_cpat, + string asm, X86FoldableSchedWrite sched> { + def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))]>, + Sched<[sched]>; + def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int mem_cpat:$src))]>, + Sched<[sched.Folded]>; +} + +multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, X86MemOperand x86memop, + string asm, X86FoldableSchedWrite sched, + bit Is2Addr = 1> { +let hasSideEffects = 0 in { + def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + []>, Sched<[sched]>; + let mayLoad = 1 in + def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + []>, Sched<[sched.Folded, ReadAfterLd]>; +} +} + +let Predicates = [UseAVX] in { +defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, + int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si", + WriteCvtSD2I>, XD, VEX, VEX_LIG; +defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, + int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si", + WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG; +} +defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, + sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD; +defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, + sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W; + + +let isCodeGenOnly = 1 in { + let Predicates = [UseAVX] in { + defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + i32mem, "cvtsi2ss{l}", WriteCvtI2SS, 0>, XS, VEX_4V; + defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + i64mem, "cvtsi2ss{q}", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_W; + defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + i32mem, "cvtsi2sd{l}", WriteCvtI2SD, 0>, XD, VEX_4V; + defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + i64mem, "cvtsi2sd{q}", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_W; + } + let Constraints = "$src1 = $dst" in { + defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + i32mem, "cvtsi2ss{l}", WriteCvtI2SS>, XS; + defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + i64mem, "cvtsi2ss{q}", WriteCvtI2SS>, XS, REX_W; + defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + i32mem, "cvtsi2sd{l}", WriteCvtI2SD>, XD; + defm CVTSI642SD : 
sse12_cvt_sint_3addr<0x2A, GR64, VR128, + i64mem, "cvtsi2sd{q}", WriteCvtI2SD>, XD, REX_W; + } +} // isCodeGenOnly = 1 + +/// SSE 1 Only + +// Aliases for intrinsics +let isCodeGenOnly = 1 in { +let Predicates = [UseAVX] in { +defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, + ssmem, sse_load_f32, "cvttss2si", + WriteCvtSS2I>, XS, VEX; +defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse_cvttss2si64, ssmem, sse_load_f32, + "cvttss2si", WriteCvtSS2I>, + XS, VEX, VEX_W; +defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, + sdmem, sse_load_f64, "cvttsd2si", + WriteCvtSS2I>, XD, VEX; +defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, + "cvttsd2si", WriteCvtSS2I>, + XD, VEX, VEX_W; +} +defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, + ssmem, sse_load_f32, "cvttss2si", + WriteCvtSS2I>, XS; +defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse_cvttss2si64, ssmem, sse_load_f32, + "cvttss2si", WriteCvtSS2I>, XS, REX_W; +defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, + sdmem, sse_load_f64, "cvttsd2si", + WriteCvtSD2I>, XD; +defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, + "cvttsd2si", WriteCvtSD2I>, XD, REX_W; +} // isCodeGenOnly = 1 + +let Predicates = [UseAVX] in { +defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, + ssmem, sse_load_f32, "cvtss2si", + WriteCvtSS2I>, XS, VEX, VEX_LIG; +defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, + ssmem, sse_load_f32, "cvtss2si", + WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG; +} +defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, + ssmem, sse_load_f32, "cvtss2si", + WriteCvtSS2I>, XS; +defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, + ssmem, sse_load_f32, "cvtss2si", + WriteCvtSS2I>, XS, REX_W; + +defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64, + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, WriteCvtI2PS>, + PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG; +defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64, + "vcvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, WriteCvtI2PSY>, + PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG; + +defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64, + "cvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle, WriteCvtI2PS>, + PS, Requires<[UseSSE2]>; + +let Predicates = [UseAVX] in { +def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; +def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">; +def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; +def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", + (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">; +def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; +def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">; +def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; +def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", + (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">; +} + +def : 
InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", + (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; +def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", + (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0, "att">; +def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0, "att">; +def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", + (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0, "att">; +def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", + (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; +def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", + (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0, "att">; +def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0, "att">; +def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", + (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0, "att">; + +/// SSE 2 Only + +// Convert scalar double to scalar single +let hasSideEffects = 0, Predicates = [UseAVX] in { +def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), + (ins FR32:$src1, FR64:$src2), + "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V, VEX_LIG, VEX_WIG, + Sched<[WriteCvtSD2SS]>; +let mayLoad = 1 in +def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), + (ins FR32:$src1, f64mem:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + XD, VEX_4V, VEX_LIG, VEX_WIG, + Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>; +} + +def : Pat<(f32 (fpround FR64:$src)), + (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>, + Requires<[UseAVX]>; + +def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), + "cvtsd2ss\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (fpround FR64:$src))]>, + Sched<[WriteCvtSD2SS]>; +def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), + "cvtsd2ss\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>, + XD, Requires<[UseSSE2, OptForSize]>, + Sched<[WriteCvtSD2SS.Folded]>; + +let isCodeGenOnly = 1 in { +def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>, + XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>, + Sched<[WriteCvtSD2SS]>; +def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), + "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtsd2ss + VR128:$src1, sse_load_f64:$src2))]>, + XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>, + Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>; +let Constraints = "$src1 = $dst" in { +def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "cvtsd2ss\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, + (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))]>, + XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>; +def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), + "cvtsd2ss\t{$src2, $dst|$dst, $src2}", + [(set VR128:$dst, (int_x86_sse2_cvtsd2ss + VR128:$src1, sse_load_f64:$src2))]>, + XD, Requires<[UseSSE2]>, + Sched<[WriteCvtSD2SS.Folded, ReadAfterLd]>; +} +} // isCodeGenOnly = 1 + +// Convert scalar single to scalar double +// SSE2 instructions with XS prefix +let hasSideEffects = 0 in { +def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), + (ins FR64:$src1, FR32:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + XS, VEX_4V, VEX_LIG, VEX_WIG, + 
Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>; +let mayLoad = 1 in +def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), + (ins FR64:$src1, f32mem:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + XS, VEX_4V, VEX_LIG, VEX_WIG, + Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>, + Requires<[UseAVX, OptForSize]>; +} + +def : Pat<(f64 (fpextend FR32:$src)), + (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>; +def : Pat<(fpextend (loadf32 addr:$src)), + (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>; + +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, + Requires<[UseAVX, OptForSize]>; +def : Pat<(extloadf32 addr:$src), + (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>, + Requires<[UseAVX, OptForSpeed]>; + +def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), + "cvtss2sd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (fpextend FR32:$src))]>, + XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>; +def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), + "cvtss2sd\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (extloadf32 addr:$src))]>, + XS, Requires<[UseSSE2, OptForSize]>, + Sched<[WriteCvtSS2SD.Folded]>; + +// extload f32 -> f64. This matches load+fpextend because we have a hack in +// the isel (PreprocessForFPConvert) that can introduce loads after dag +// combine. +// Since these loads aren't folded into the fpextend, we have to match it +// explicitly here. +def : Pat<(fpextend (loadf32 addr:$src)), + (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2, OptForSize]>; +def : Pat<(extloadf32 addr:$src), + (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; + +let isCodeGenOnly = 1, hasSideEffects = 0 in { +def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, XS, VEX_4V, VEX_WIG, + Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>; +let mayLoad = 1 in +def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), + "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>, + Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>; +let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix +def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + "cvtss2sd\t{$src2, $dst|$dst, $src2}", + []>, XS, Requires<[UseSSE2]>, + Sched<[WriteCvtSS2SD]>; +let mayLoad = 1 in +def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), + "cvtss2sd\t{$src2, $dst|$dst, $src2}", + []>, XS, Requires<[UseSSE2]>, + Sched<[WriteCvtSS2SD.Folded, ReadAfterLd]>; +} +} // isCodeGenOnly = 1 + +// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and +// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary +// vmovs{s,d} instructions +let Predicates = [UseAVX] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 
(X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), + (VCVTSI642SSrm_Int VR128:$dst, addr:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), + (VCVTSI2SSrm_Int VR128:$dst, addr:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), + (VCVTSI642SDrm_Int VR128:$dst, addr:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), + (VCVTSI2SDrm_Int VR128:$dst, addr:$src)>; +} // Predicates = [UseAVX] + +let Predicates = [UseSSE2] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector + (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), + (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector + (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), + (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), + (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))), + (CVTSI642SDrm_Int VR128:$dst, addr:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), + (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>; + +def : Pat<(v2f64 (X86Movsd + (v2f64 VR128:$dst), + (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))), + (CVTSI2SDrm_Int VR128:$dst, addr:$src)>; +} // Predicates = [UseSSE2] + +let Predicates = [UseSSE1] in { +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), + (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))), + (CVTSI642SSrm_Int VR128:$dst, addr:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), + (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>; + +def : Pat<(v4f32 (X86Movss + (v4f32 VR128:$dst), + (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))), + (CVTSI2SSrm_Int VR128:$dst, addr:$src)>; +} // Predicates = [UseSSE1] + +let Predicates = [HasAVX, NoVLX] in { +// Convert packed single/double fp to doubleword +def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>, + VEX, Sched<[WriteCvtPS2I]>, VEX_WIG; +def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>, + VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG; +def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins 
VR256:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>, + VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG; +def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>, + VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG; +} +def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>, + Sched<[WriteCvtPS2I]>; +def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>, + Sched<[WriteCvtPS2ILd]>; + + +// Convert Packed Double FP to Packed DW Integers +let Predicates = [HasAVX, NoVLX] in { +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. +def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, + VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; + +// XMM only +def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>; +def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX, + Sched<[WriteCvtPD2ILd]>, VEX_WIG; +def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">; + +// YMM only +def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "vcvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>, + VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG; +def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>, + VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; +def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", + (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>; +def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}", + (VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">; +} + +def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>, + Sched<[WriteCvtPD2ILd]>; +def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>, + Sched<[WriteCvtPD2I]>; + +// Convert with truncation packed single/double fp to doubleword +// SSE2 packed instructions with XS prefix +let Predicates = [HasAVX, NoVLX] in { +def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>, + VEX, Sched<[WriteCvtPS2I]>, VEX_WIG; +def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>, + VEX, Sched<[WriteCvtPS2ILd]>, 
VEX_WIG; +def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>, + VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG; +def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>, + VEX, VEX_L, + Sched<[WriteCvtPS2IYLd]>, VEX_WIG; +} + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), + (VCVTTPS2DQrr VR128:$src)>; + def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))), + (VCVTTPS2DQrm addr:$src)>; + def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))), + (VCVTTPS2DQYrr VR256:$src)>; + def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))), + (VCVTTPS2DQYrm addr:$src)>; +} + +def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>, + Sched<[WriteCvtPS2I]>; +def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>, + Sched<[WriteCvtPS2ILd]>; + +let Predicates = [UseSSE2] in { + def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), + (CVTTPS2DQrr VR128:$src)>; + def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))), + (CVTTPS2DQrm addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in +def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>, + VEX, Sched<[WriteCvtPD2I]>, VEX_WIG; + +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. 
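+// To make the ambiguity concrete (illustrative AT&T assembly, not generated
+// from these defs): "vcvttpd2dq %ymm1, %xmm0" is unambiguous because the ymm
+// source fixes the width, but "vcvttpd2dq (%rax), %xmm0" could read either
+// 128 or 256 bits. The aliases below therefore accept the suffixed spellings
+// "vcvttpd2dqx (%rax), %xmm0" and "vcvttpd2dqy (%rax), %xmm0" so the memory
+// width can be stated explicitly.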
+ +// XMM only +def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>; + +let Predicates = [HasAVX, NoVLX] in +def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttpd2dq{x}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>, + VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG; +def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0, "intel">; + +// YMM only +let Predicates = [HasAVX, NoVLX] in { +def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>, + VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG; +def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "cvttpd2dq{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>, + VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG; +} +def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; +def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", + (VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0, "intel">; + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), + (VCVTTPD2DQYrr VR256:$src)>; + def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), + (VCVTTPD2DQYrm addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), + (VCVTPD2DQrr VR128:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))), + (VCVTPD2DQrm addr:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), + (VCVTTPD2DQrr VR128:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))), + (VCVTTPD2DQrm addr:$src)>; +} // Predicates = [HasAVX, NoVLX] + +def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>, + Sched<[WriteCvtPD2I]>; +def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>, + Sched<[WriteCvtPD2ILd]>; + +let Predicates = [UseSSE2] in { + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), + (CVTPD2DQrr VR128:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))), + (CVTPD2DQrm addr:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), + (CVTTPD2DQrr VR128:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))), + (CVTTPD2DQrm addr:$src)>; +} // Predicates = [UseSSE2] + +// Convert packed single to packed double +let Predicates = [HasAVX, NoVLX] in { + // SSE2 instructions without OpSize prefix +def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>, + PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG; +def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, + PS, VEX, 
Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG; +def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>, + PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG; +def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>, + PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG; +} + +let Predicates = [UseSSE2] in { +def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>, + PS, Sched<[WriteCvtPS2PD]>; +def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + "cvtps2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, + PS, Sched<[WriteCvtPS2PD.Folded]>; +} + +// Convert Packed DW Integers to Packed Double FP +let Predicates = [HasAVX, NoVLX] in { +let hasSideEffects = 0, mayLoad = 1 in +def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, + VEX, Sched<[WriteCvtI2PDLd]>, VEX_WIG; +def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>, + VEX, Sched<[WriteCvtI2PD]>, VEX_WIG; +def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>, + VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>, + VEX_WIG; +def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>, + VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG; +} + +let hasSideEffects = 0, mayLoad = 1 in +def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, + Sched<[WriteCvtI2PDLd]>; +def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtdq2pd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>, + Sched<[WriteCvtI2PD]>; + +// AVX register conversion intrinsics +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (VCVTDQ2PDrm addr:$src)>; + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), + (VCVTDQ2PDrm addr:$src)>; +} // Predicates = [HasAVX, NoVLX] + +// SSE2 register conversion intrinsics +let Predicates = [UseSSE2] in { + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (CVTDQ2PDrm addr:$src)>; + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), + (CVTDQ2PDrm addr:$src)>; +} // Predicates = [UseSSE2] + +// Convert packed double to packed single +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. 
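+// Note that this conversion narrows each element (f64 -> f32), which is why
+// even the 256-bit source form below writes only an xmm destination
+// (VCVTPD2PSYrr: VR256 in, VR128 out). As a rough C-level illustration
+// (the intrinsic comes from immintrin.h, not from this file):
+//   __m128 narrow(__m256d v) { return _mm256_cvtpd_ps(v); } // 4 x f64 -> 4 x f32
+// The x/y alias spellings below serve the same purpose as for the dq
+// conversions above: they pin down the memory-operand width.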
+let Predicates = [HasAVX, NoVLX] in +def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>, + VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG; + +// XMM only +def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", + (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>; +let Predicates = [HasAVX, NoVLX] in +def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2ps{x}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>, + VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG; +def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}", + (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0, "intel">; + +// YMM only +let Predicates = [HasAVX, NoVLX] in { +def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (fpround VR256:$src))]>, + VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG; +def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "cvtpd2ps{y}\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))]>, + VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG; +} +def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", + (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>; +def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}", + (VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0, "intel">; + +def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>, + Sched<[WriteCvtPD2PS]>; +def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>, + Sched<[WriteCvtPD2PS.Folded]>; + +// AVX 256-bit register conversion intrinsics +// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below +// whenever possible to avoid declaring two versions of each one. 
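+// A note on the X86vzmovl patterns that follow: the 128-bit (v)cvtpd2ps only
+// produces two f32 results and zeroes the upper half of the destination
+// register, so wrapping the conversion in an explicit "move and zero upper
+// elements" adds nothing; the patterns fold that wrapper away and emit the
+// bare conversion.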
+ +let Predicates = [HasAVX, NoVLX] in { + // Match fpround and fpextend for 128/256-bit conversions + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128:$src)))))), + (VCVTPD2PSrr VR128:$src)>; + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (loadv2f64 addr:$src)))))), + (VCVTPD2PSrm addr:$src)>; +} + +let Predicates = [UseSSE2] in { + // Match fpround and fpextend for 128 conversions + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128:$src)))))), + (CVTPD2PSrr VR128:$src)>; + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (memopv2f64 addr:$src)))))), + (CVTPD2PSrm addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Compare Instructions +//===----------------------------------------------------------------------===// + +// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions +multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, + Operand CC, SDNode OpNode, ValueType VT, + PatFrag ld_frag, string asm, string asm_alt, + X86FoldableSchedWrite sched> { + let isCommutable = 1 in + def rr : SIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>, + Sched<[sched]>; + def rm : SIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + [(set RC:$dst, (OpNode (VT RC:$src1), + (ld_frag addr:$src2), imm:$cc))]>, + Sched<[sched.Folded, ReadAfterLd]>; + + // Accept explicit immediate argument form instead of comparison code. + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, []>, + Sched<[sched]>, NotMemoryFoldable; + let mayLoad = 1 in + def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, []>, + Sched<[sched.Folded, ReadAfterLd]>, NotMemoryFoldable; + } +} + +let ExeDomain = SSEPackedSingle in +defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, + "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG; +let ExeDomain = SSEPackedDouble in +defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, + "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SchedWriteFCmpSizes.PD.Scl>, + XD, VEX_4V, VEX_LIG, VEX_WIG; + +let Constraints = "$src1 = $dst" in { + let ExeDomain = SSEPackedSingle in + defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32, + "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", + "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SchedWriteFCmpSizes.PS.Scl>, XS; + let ExeDomain = SSEPackedDouble in + defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64, + "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", + "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SchedWriteFCmpSizes.PD.Scl>, XD; +} + +multiclass sse12_cmp_scalar_int<Operand memop, Operand CC, + Intrinsic Int, string asm, X86FoldableSchedWrite sched, + ComplexPattern mem_cpat> { + def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src, CC:$cc), asm, + [(set VR128:$dst, (Int VR128:$src1, + VR128:$src, imm:$cc))]>, + Sched<[sched]>; +let mayLoad = 1 in + def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, memop:$src, 
CC:$cc), asm, + [(set VR128:$dst, (Int VR128:$src1, + mem_cpat:$src, imm:$cc))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +let isCodeGenOnly = 1 in { + // Aliases to match intrinsics which expect XMM operand(s). + let ExeDomain = SSEPackedSingle in + defm VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", + SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS, VEX_4V; + let ExeDomain = SSEPackedDouble in + defm VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", + SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, + XD, VEX_4V; + let Constraints = "$src1 = $dst" in { + let ExeDomain = SSEPackedSingle in + defm CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss, + "cmp${cc}ss\t{$src, $dst|$dst, $src}", + SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS; + let ExeDomain = SSEPackedDouble in + defm CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd, + "cmp${cc}sd\t{$src, $dst|$dst, $src}", + SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD; +} +} + + +// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS +multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, + ValueType vt, X86MemOperand x86memop, + PatFrag ld_frag, string OpcodeStr, + X86FoldableSchedWrite sched> { +let hasSideEffects = 0 in { + def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, + Sched<[sched]>; +let mayLoad = 1 in + def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), + (ld_frag addr:$src2)))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} +} + +// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp +multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode, + ValueType vt, Operand memop, + ComplexPattern mem_cpat, string OpcodeStr, + X86FoldableSchedWrite sched> { + def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>, + Sched<[sched]>; +let mayLoad = 1 in + def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (OpNode (vt RC:$src1), + mem_cpat:$src2))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +let Defs = [EFLAGS] in { + defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; + defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; + let Pattern = []<dag> in { + defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, + "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; + defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, + "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; + } + + let isCodeGenOnly = 1 in { + defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, + sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_WIG; + defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, + sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_WIG; + + defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, + sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_WIG; + defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, 
X86comi, v2f64, sdmem, + sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_WIG; + } + defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, + "ucomiss", WriteFCom>, PS; + defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, + "ucomisd", WriteFCom>, PD; + + let Pattern = []<dag> in { + defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, + "comiss", WriteFCom>, PS; + defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, + "comisd", WriteFCom>, PD; + } + + let isCodeGenOnly = 1 in { + defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, + sse_load_f32, "ucomiss", WriteFCom>, PS; + defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, + sse_load_f64, "ucomisd", WriteFCom>, PD; + + defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, + sse_load_f32, "comiss", WriteFCom>, PS; + defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, + sse_load_f64, "comisd", WriteFCom>, PD; + } +} // Defs = [EFLAGS] + +// sse12_cmp_packed - sse 1 & 2 compare packed instructions +multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, + Operand CC, ValueType VT, string asm, + string asm_alt, X86FoldableSchedWrite sched, + Domain d, PatFrag ld_frag> { + let isCommutable = 1 in + def rri : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, + [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], d>, + Sched<[sched]>; + def rmi : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, + [(set RC:$dst, + (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], d>, + Sched<[sched.Folded, ReadAfterLd]>; + + // Accept explicit immediate argument form instead of comparison code. + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rri_alt : PIi8<0xC2, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), + asm_alt, [], d>, Sched<[sched]>, NotMemoryFoldable; + let mayLoad = 1 in + def rmi_alt : PIi8<0xC2, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), + asm_alt, [], d>, Sched<[sched.Folded, ReadAfterLd]>, + NotMemoryFoldable; + } +} + +defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32, + "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG; +defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64, + "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG; +defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32, + "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG; +defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64, + "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", + SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG; +let Constraints = "$src1 = $dst" in { + defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32, + "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", + "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS; + defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, 
v2f64, + "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", + "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", + SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD; +} + +def CommutableCMPCC : PatLeaf<(imm), [{ + uint64_t Imm = N->getZExtValue() & 0x7; + return (Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07); +}]>; + +// Patterns to select compares with loads in first operand. +let Predicates = [HasAVX] in { + def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1, + CommutableCMPCC:$cc)), + (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1, + CommutableCMPCC:$cc)), + (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1, + CommutableCMPCC:$cc)), + (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1, + CommutableCMPCC:$cc)), + (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, + CommutableCMPCC:$cc)), + (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, + CommutableCMPCC:$cc)), + (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1, + CommutableCMPCC:$cc)), + (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, + CommutableCMPCC:$cc)), + (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; +} + +let Predicates = [UseSSE1] in { + def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1, + CommutableCMPCC:$cc)), + (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, + CommutableCMPCC:$cc)), + (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Shuffle Instructions +//===----------------------------------------------------------------------===// + +/// sse12_shuffle - sse 1 & 2 fp shuffle instructions +multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, + ValueType vt, string asm, PatFrag mem_frag, + X86FoldableSchedWrite sched, Domain d> { + def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), asm, + [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), + (i8 imm:$src3))))], d>, + Sched<[sched.Folded, ReadAfterLd]>; + def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), asm, + [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, + (i8 imm:$src3))))], d>, + Sched<[sched]>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, + PS, VEX_4V, VEX_WIG; + defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>, + PS, VEX_4V, VEX_L, VEX_WIG; + defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, + PD, VEX_4V, VEX_WIG; + defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>, + PD, VEX_4V, 
VEX_L, VEX_WIG; +} +let Constraints = "$src1 = $dst" in { + defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, + "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", + memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; + defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", + memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Unpack FP Instructions +//===----------------------------------------------------------------------===// + +/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave +multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, + PatFrag mem_frag, RegisterClass RC, + X86MemOperand x86memop, string asm, + X86FoldableSchedWrite sched, Domain d, + bit IsCommutable = 0> { + let isCommutable = IsCommutable in + def rr : PI<opc, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + asm, [(set RC:$dst, + (vt (OpNode RC:$src1, RC:$src2)))], d>, + Sched<[sched]>; + def rm : PI<opc, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + asm, [(set RC:$dst, + (vt (OpNode RC:$src1, + (mem_frag addr:$src2))))], d>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoVLX] in { +defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, + VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; +defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG; +defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32, + VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; +defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; + +defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, + VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; +defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64, + VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; +defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, + VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; +defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, + VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", + SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; +}// Predicates = [HasAVX, NoVLX] + +let Constraints = "$src1 = $dst" in { + defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, + VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", + SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; + defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, + VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", + SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; + defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, 
v4f32, memopv4f32, + VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", + SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; + defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64, + VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", + SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; +} // Constraints = "$src1 = $dst" + +let Predicates = [HasAVX1Only] in { + def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), + (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))), + (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)), + (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))), + (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; + def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)), + (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; +} + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Extract Floating-Point Sign mask +//===----------------------------------------------------------------------===// + +/// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave +multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt, + string asm, Domain d> { + def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], d>, + Sched<[WriteFMOVMSK]>; +} + +let Predicates = [HasAVX] in { + defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", + SSEPackedSingle>, PS, VEX, VEX_WIG; + defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", + SSEPackedDouble>, PD, VEX, VEX_WIG; + defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps", + SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG; + defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd", + SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG; +} + +defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", + SSEPackedSingle>, PS; +defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", + SSEPackedDouble>, PD; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Logical Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +/// PDI_binop_rm - Simple SSE2 binary operator. 
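+/// Each instantiation produces a register-register form and a folded-load
+/// form, named by appending "rr" and "rm" to the defm name; e.g. the PAND
+/// instantiation further below ends up as PANDrr and PANDrm. (Descriptive
+/// note only; the multiclass body below is the authoritative definition.)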
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, X86FoldableSchedWrite sched, + bit IsCommutable, bit Is2Addr> { + let isCommutable = IsCommutable in + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, + Sched<[sched]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} +} // ExeDomain = SSEPackedInt + +multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, + ValueType OpVT128, ValueType OpVT256, + X86SchedWriteWidths sched, bit IsCommutable, + Predicate prd> { +let Predicates = [HasAVX, prd] in + defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, + VR128, loadv2i64, i128mem, sched.XMM, + IsCommutable, 0>, VEX_4V, VEX_WIG; + +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, + memopv2i64, i128mem, sched.XMM, IsCommutable, 1>; + +let Predicates = [HasAVX2, prd] in + defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, + OpVT256, VR256, loadv4i64, i256mem, sched.YMM, + IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG; +} + +// These are ordered here for pattern ordering requirements with the fp versions + +defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, + SchedWriteVecLogic, 1, NoVLX>; +defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, + SchedWriteVecLogic, 1, NoVLX>; +defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, + SchedWriteVecLogic, 1, NoVLX>; +defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, + SchedWriteVecLogic, 0, NoVLX>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Logical Instructions +//===----------------------------------------------------------------------===// + +/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops +/// +/// There are no patterns here because isel prefers integer versions for SSE2 +/// and later. There are SSE1 v4f32 patterns later. 
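+/// In other words, for SSE2 and later the bitwise work is carried by the
+/// PAND/POR/PXOR/PANDN records defined above; the FP-domain encodings built
+/// here are still needed for assembly/disassembly and for the scalar and
+/// SSE1-only patterns further down in this file.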
+multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86SchedWriteWidths sched> { + let Predicates = [HasAVX, NoVLX] in { + defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM, + [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG; + + defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM, + [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG; + + defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, + [], [], 0>, PS, VEX_4V, VEX_WIG; + + defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, + [], [], 0>, PD, VEX_4V, VEX_WIG; + } + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, + !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, + [], []>, PS; + + defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, + !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, + [], []>, PD; + } +} + +defm AND : sse12_fp_packed_logical<0x54, "and", and, SchedWriteFLogic>; +defm OR : sse12_fp_packed_logical<0x56, "or", or, SchedWriteFLogic>; +defm XOR : sse12_fp_packed_logical<0x57, "xor", xor, SchedWriteFLogic>; +let isCommutable = 0 in + defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp, SchedWriteFLogic>; + +// If only AVX1 is supported, we need to handle integer operations with +// floating point instructions since the integer versions aren't available. +let Predicates = [HasAVX1Only] in { + def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)), + (VANDPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)), + (VORPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)), + (VXORPSYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)), + (VANDNPSYrr VR256:$src1, VR256:$src2)>; + + def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)), + (VANDPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)), + (VORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)), + (VXORPSYrm VR256:$src1, addr:$src2)>; + def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)), + (VANDNPSYrm VR256:$src1, addr:$src2)>; +} + +let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { + // Use packed logical operations for scalar ops. 
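+  // There are no scalar (ss/sd) forms of AND/OR/XOR/ANDN, so scalar f32/f64
+  // bitwise logic -- typically sign-bit manipulation from fabs/fneg/copysign
+  // style lowerings -- is done by copying the value into VR128 and using the
+  // packed instruction; only the low element of the result is meaningful.
+  // Rough C-level illustration (hypothetical function, not from this file):
+  //   float my_fabs(float x) { return __builtin_fabsf(x); }
+  // typically becomes an ANDPS/VANDPS with a sign-mask constant.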
+ def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS + (v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; + def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS + (v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; + def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS + (v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; + def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS + (v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; + + def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS + (v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; + def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS + (v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; + def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS + (v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; + def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS + (v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; +} + +let Predicates = [UseSSE1] in { + // Use packed logical operations for scalar ops. + def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS + (v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; + def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS + (v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; + def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS + (v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; + def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)), + (COPY_TO_REGCLASS + (v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)), + (v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))), + FR32)>; +} + +let Predicates = [UseSSE2] in { + // Use packed logical operations for scalar ops. + def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS + (v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; + def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS + (v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; + def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS + (v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; + def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)), + (COPY_TO_REGCLASS + (v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)), + (v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))), + FR64)>; +} + +// Patterns for packed operations when we don't have integer type available. 
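+// These are the SSE1 v4f32 patterns referred to above: without SSE2 there is
+// no PAND/POR/PXOR/PANDN and v4f32 is the only legal 128-bit vector type, so
+// the packed FP logic instructions have to carry plain bitwise operations as
+// well.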
+def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)), + (ANDPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)), + (ORPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)), + (XORPSrr VR128:$src1, VR128:$src2)>; +def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)), + (ANDNPSrr VR128:$src1, VR128:$src2)>; + +def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)), + (ANDPSrm VR128:$src1, addr:$src2)>; +def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)), + (ORPSrm VR128:$src1, addr:$src2)>; +def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)), + (XORPSrm VR128:$src1, addr:$src2)>; +def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)), + (ANDNPSrm VR128:$src1, addr:$src2)>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Arithmetic Instructions +//===----------------------------------------------------------------------===// + +/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and +/// vector forms. +/// +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a scalar) +/// and leaves the top elements unmodified (therefore these cannot be commuted). +/// +/// These three forms can each be reg+reg or reg+mem. +/// + +/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those +/// classes below +multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86SchedWriteSizes sched> { + let Predicates = [HasAVX, NoVLX] in { + defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, + VR128, v4f32, f128mem, loadv4f32, + SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, VEX_WIG; + defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, + VR128, v2f64, f128mem, loadv2f64, + SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, VEX_WIG; + + defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), + OpNode, VR256, v8f32, f256mem, loadv8f32, + SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, VEX_WIG; + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), + OpNode, VR256, v4f64, f256mem, loadv4f64, + SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, VEX_WIG; + } + + let Constraints = "$src1 = $dst" in { + defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, + v4f32, f128mem, memopv4f32, SSEPackedSingle, + sched.PS.XMM>, PS; + defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, + v2f64, f128mem, memopv2f64, SSEPackedDouble, + sched.PD.XMM>, PD; + } +} + +multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteSizes sched> { + defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), + OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>, + XS, VEX_4V, VEX_LIG, VEX_WIG; + defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), + OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>, + XD, VEX_4V, VEX_LIG, VEX_WIG; + + let Constraints = "$src1 = $dst" in { + defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), + OpNode, FR32, f32mem, SSEPackedSingle, + sched.PS.Scl>, XS; + defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), + OpNode, FR64, f64mem, SSEPackedDouble, + sched.PD.Scl>, XD; + } +} + +multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, + 
SDPatternOperator OpNode, + X86SchedWriteSizes sched> { + defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, + !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, + SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG; + defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64, + !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, + SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG; + + let Constraints = "$src1 = $dst" in { + defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32, + !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, + SSEPackedSingle, sched.PS.Scl>, XS; + defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64, + !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, + SSEPackedDouble, sched.PD.Scl>, XD; + } +} + +// Binary Arithmetic instructions +defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>, + basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>, + basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>; +defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>, + basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>, + basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>; +let isCommutable = 0 in { + defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>, + basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>, + basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>; + defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>, + basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>, + basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>; + defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, + basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>, + basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SchedWriteFCmpSizes>; + defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, + basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SchedWriteFCmpSizes>, + basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SchedWriteFCmpSizes>; +} + +let isCodeGenOnly = 1 in { + defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>, + basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SchedWriteFCmpSizes>; + defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SchedWriteFCmpSizes>, + basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SchedWriteFCmpSizes>; +} + +// Patterns used to select SSE scalar fp arithmetic instructions from +// either: +// +// (1) a scalar fp operation followed by a blend +// +// The effect is that the backend no longer emits unnecessary vector +// insert instructions immediately after SSE scalar fp instructions +// like addss or mulss. +// +// For example, given the following code: +// __m128 foo(__m128 A, __m128 B) { +// A[0] += B[0]; +// return A; +// } +// +// Previously we generated: +// addss %xmm0, %xmm1 +// movss %xmm1, %xmm0 +// +// We now generate: +// addss %xmm1, %xmm0 +// +// (2) a vector packed single/double fp operation followed by a vector insert +// +// The effect is that the backend converts the packed fp instruction +// followed by a vector insert into a single SSE scalar fp instruction. 
+//
+// For example, given the following code:
+//   __m128 foo(__m128 A, __m128 B) {
+//     __m128 C = A + B;
+//     return (__m128) {C[0], A[1], A[2], A[3]};
+//   }
+//
+// Previously we generated:
+//   addps %xmm0, %xmm1
+//   movss %xmm1, %xmm0
+//
+// We now generate:
+//   addss %xmm1, %xmm0
+
+// TODO: Some canonicalization in lowering would simplify the number of
+// patterns we have to try to match.
+multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
+                                ValueType VT, ValueType EltTy,
+                                RegisterClass RC, Predicate BasePredicate> {
+  let Predicates = [BasePredicate] in {
+    // extracted scalar math op with insert via movss/movsd
+    def : Pat<(VT (Move (VT VR128:$dst),
+                        (VT (scalar_to_vector
+                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+                                 RC:$src))))),
+              (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
+               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+  }
+
+  // Repeat for AVX versions of the instructions.
+  let Predicates = [UseAVX] in {
+    // extracted scalar math op with insert via movss/movsd
+    def : Pat<(VT (Move (VT VR128:$dst),
+                        (VT (scalar_to_vector
+                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+                                 RC:$src))))),
+              (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
+               (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+  }
+}
+
+defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+
+defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+
+/// Unop Arithmetic
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+
+/// sse_fp_unop_s - SSE1 unops in scalar form
+/// For the non-AVX defs, we need $src1 to be tied to $dst because
+/// the HW instructions are 2 operand / destructive.
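+/// e.g. "sqrtss %xmm1, %xmm0" writes sqrt(xmm1[31:0]) into xmm0[31:0] and
+/// leaves xmm0[127:32] untouched, so the destination is also an input.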
+multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType ScalarVT, X86MemOperand x86memop, + Operand intmemop, SDNode OpNode, Domain d, + X86FoldableSchedWrite sched, Predicate target> { + let hasSideEffects = 0 in { + def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), + [(set RC:$dst, (OpNode RC:$src1))], d>, Sched<[sched]>, + Requires<[target]>; + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1), + !strconcat(OpcodeStr, "\t{$src1, $dst|$dst, $src1}"), + [(set RC:$dst, (OpNode (load addr:$src1)))], d>, + Sched<[sched.Folded, ReadAfterLd]>, + Requires<[target, OptForSize]>; + + let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in { + def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, + Sched<[sched]>; + let mayLoad = 1 in + def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, + Sched<[sched.Folded, ReadAfterLd]>; + } + } + +} + +multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt, + ComplexPattern int_cpat, Intrinsic Intr, + Predicate target, string Suffix> { + let Predicates = [target] in { + // These are unary operations, but they are modeled as having 2 source operands + // because the high elements of the destination are unchanged in SSE. + def : Pat<(Intr VR128:$src), + (!cast<Instruction>(NAME#r_Int) VR128:$src, VR128:$src)>; + } + // We don't want to fold scalar loads into these instructions unless + // optimizing for size. This is because the folded instruction will have a + // partial register update, while the unfolded sequence will not, e.g. + // movss mem, %xmm0 + // rcpss %xmm0, %xmm0 + // which has a clobber before the rcp, vs. 
+ // rcpss mem, %xmm0 + let Predicates = [target, OptForSize] in { + def : Pat<(Intr int_cpat:$src2), + (!cast<Instruction>(NAME#m_Int) + (vt (IMPLICIT_DEF)), addr:$src2)>; + } +} + +multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat, + Intrinsic Intr, Predicate target> { + let Predicates = [target] in { + def : Pat<(Intr VR128:$src), + (!cast<Instruction>(NAME#r_Int) VR128:$src, + VR128:$src)>; + } + let Predicates = [target, OptForSize] in { + def : Pat<(Intr int_cpat:$src2), + (!cast<Instruction>(NAME#m_Int) + (vt (IMPLICIT_DEF)), addr:$src2)>; + } +} + +multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType ScalarVT, X86MemOperand x86memop, + Operand intmemop, SDNode OpNode, Domain d, + X86FoldableSchedWrite sched, Predicate target> { + let hasSideEffects = 0 in { + def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [], d>, Sched<[sched]>; + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [], d>, Sched<[sched.Folded, ReadAfterLd]>; + let isCodeGenOnly = 1, ExeDomain = d in { + def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[sched]>; + let mayLoad = 1 in + def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, intmemop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[sched.Folded, ReadAfterLd]>; + } + } + + // We don't want to fold scalar loads into these instructions unless + // optimizing for size. This is because the folded instruction will have a + // partial register update, while the unfolded sequence will not, e.g. + // vmovss mem, %xmm0 + // vrcpss %xmm0, %xmm0, %xmm0 + // which has a clobber before the rcp, vs. + // vrcpss mem, %xmm0, %xmm0 + // TODO: In theory, we could fold the load, and avoid the stall caused by + // the partial register store, either in BreakFalseDeps or with smarter RA. + let Predicates = [target] in { + def : Pat<(OpNode RC:$src), (!cast<Instruction>(NAME#r) + (ScalarVT (IMPLICIT_DEF)), RC:$src)>; + } + let Predicates = [target, OptForSize] in { + def : Pat<(ScalarVT (OpNode (load addr:$src))), + (!cast<Instruction>(NAME#m) (ScalarVT (IMPLICIT_DEF)), + addr:$src)>; + } +} + +/// sse1_fp_unop_p - SSE1 unops in packed form. 
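+/// Unlike the scalar forms, the packed forms overwrite every lane of the
+/// destination, so no tied-operand constraint is needed; the multiclass
+/// emits both the legacy and the VEX-encoded (V-prefixed) variants.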
+multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, list<Predicate> prds> { +let Predicates = prds in { + def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, + VEX, Sched<[sched.XMM]>, VEX_WIG; + def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>, + VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; + def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))]>, + VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; + def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat("v", OpcodeStr, + "ps\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))]>, + VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; +} + + def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]>, + Sched<[sched.XMM]>; + def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>, + Sched<[sched.XMM.Folded]>; +} + +/// sse2_fp_unop_p - SSE2 unops in vector forms. +multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86SchedWriteWidths sched> { +let Predicates = [HasAVX, NoVLX] in { + def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, + VEX, Sched<[sched.XMM]>, VEX_WIG; + def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>, + VEX, Sched<[sched.XMM.Folded]>, VEX_WIG; + def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))]>, + VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; + def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat("v", OpcodeStr, + "pd\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))]>, + VEX, VEX_L, Sched<[sched.YMM.Folded]>, VEX_WIG; +} + + def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]>, + Sched<[sched.XMM]>; + def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>, + Sched<[sched.XMM.Folded]>; +} + +multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, Predicate AVXTarget> { + defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32, + !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), + UseSSE1, "SS">, XS; + defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32, + !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), + AVXTarget>, + XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; +} + +multiclass 
sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, Predicate AVXTarget> { + defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem, + ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS; + defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32, + f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, + XS, VEX_4V, VEX_LIG, VEX_WIG; +} + +multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86SchedWriteWidths sched, Predicate AVXTarget> { + defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem, + sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; + defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64, + f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, + XD, VEX_4V, VEX_LIG, VEX_WIG; +} + +// Square root. +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>, + sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>; + +// Reciprocal approximations. Note that these typically require refinement +// in order to obtain suitable precision. +defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, + sse1_fp_unop_s_intr<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, HasAVX>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SchedWriteFRsqrt, [HasAVX]>; +defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, + sse1_fp_unop_s_intr<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, + sse1_fp_unop_p<0x53, "rcp", X86frcp, SchedWriteFRcp, [HasAVX]>; + +// There is no f64 version of the reciprocal approximation instructions. + +multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move, + ValueType VT, Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } + + // Repeat for AVX versions of the instructions. + let Predicates = [UseAVX] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } +} + +multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move, + ValueType VT, bits<8> ImmV, + Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>; + } + + // Repeat for AVX versions of the instructions. + let Predicates = [UseAVX] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>; + } +} + +defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>; +defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>; + +multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix, + SDNode Move, ValueType VT, + Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), + (!cast<Instruction>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } + + // Repeat for AVX versions of the instructions. 
+ let Predicates = [HasAVX] in { + def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), + (!cast<Instruction>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } +} + +defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss, + v4f32, UseSSE1>; +defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss, + v4f32, UseSSE1>; + + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Non-temporal stores +//===----------------------------------------------------------------------===// + +let AddedComplexity = 400 in { // Prefer non-temporal versions +let Predicates = [HasAVX, NoVLX] in { +let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { +def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), + addr:$dst)]>, VEX, VEX_WIG; +def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2f64 VR128:$src), + addr:$dst)]>, VEX, VEX_WIG; +} // SchedRW + +let SchedRW = [SchedWriteFMoveLSNT.YMM.MR] in { +def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v8f32 VR256:$src), + addr:$dst)]>, VEX, VEX_L, VEX_WIG; +def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f64 VR256:$src), + addr:$dst)]>, VEX, VEX_L, VEX_WIG; +} // SchedRW + +let ExeDomain = SSEPackedInt in { +def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2i64 VR128:$src), + addr:$dst)]>, VEX, VEX_WIG, + Sched<[SchedWriteVecMoveLSNT.XMM.MR]>; +def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4i64 VR256:$src), + addr:$dst)]>, VEX, VEX_L, VEX_WIG, + Sched<[SchedWriteVecMoveLSNT.YMM.MR]>; +} // ExeDomain +} // Predicates + +let SchedRW = [SchedWriteFMoveLSNT.XMM.MR] in { +def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntps\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>; +def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntpd\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>; +} // SchedRW + +let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLSNT.XMM.MR] in +def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), + "movntdq\t{$src, $dst|$dst, $src}", + [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)]>; + +let SchedRW = [WriteStoreNT] in { +// There is no AVX form for instructions below this point +def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "movnti{l}\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, + PS, Requires<[HasSSE2]>; +def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "movnti{q}\t{$src, $dst|$dst, $src}", + [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, + PS, Requires<[HasSSE2]>; +} // SchedRW = [WriteStoreNT] + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst), + (VMOVNTDQYmr addr:$dst, VR256:$src)>; + def : Pat<(alignednontemporalstore (v16i16 
VR256:$src), addr:$dst),
+            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
+            (VMOVNTDQYmr addr:$dst, VR256:$src)>;
+
+  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+            (VMOVNTDQmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
+            (VMOVNTDQmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
+            (VMOVNTDQmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+  def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
+            (MOVNTDQmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
+            (MOVNTDQmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
+            (MOVNTDQmr addr:$dst, VR128:$src)>;
+}
+
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Prefetch and memory fence
+//===----------------------------------------------------------------------===//
+
+// Prefetch intrinsic.
+let Predicates = [HasSSEPrefetch], SchedRW = [WriteLoad] in {
+def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
+    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))]>, TB;
+def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
+    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))]>, TB;
+def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
+    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))]>, TB;
+def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
+    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>, TB;
+}
+
+// FIXME: How should flush instruction be modeled?
+let SchedRW = [WriteLoad] in {
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
+               PS, Requires<[HasSSE2]>;
+}
+
+let SchedRW = [WriteNop] in {
+// Pause. This "instruction" is encoded as "rep; nop", so even though it
+// was introduced with SSE2, it's backward compatible.
+def PAUSE : I<0x90, RawFrm, (outs), (ins),
+              "pause", [(int_x86_sse2_pause)]>, OBXS;
+}
+
+let SchedRW = [WriteFence] in {
+// Load, store, and memory fence
+// TODO: As with mfence, we may want to ease the availability of sfence/lfence
+// to include any 64-bit target.
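+// sfence orders stores, lfence orders loads, and mfence orders both; all three
+// share opcode 0F AE and are distinguished only by the ModRM byte (F8, E8 and
+// F0 below).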
+def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, + PS, Requires<[HasSSE1]>; +def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>, + PS, Requires<[HasSSE2]>; +def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>, + PS, Requires<[HasMFence]>; +} // SchedRW + +def : Pat<(X86MFence), (MFENCE)>; + +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Load/Store XCSR register +//===----------------------------------------------------------------------===// + +def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, + VEX, Sched<[WriteLDMXCSR]>, VEX_WIG; +def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, + VEX, Sched<[WriteSTMXCSR]>, VEX_WIG; + +def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), + "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, + TB, Sched<[WriteLDMXCSR]>; +def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), + "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, + TB, Sched<[WriteSTMXCSR]>; + +//===---------------------------------------------------------------------===// +// SSE2 - Move Aligned/Unaligned Packed Integer Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +let hasSideEffects = 0 in { +def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG; +def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLS.XMM.RR]>, VEX, VEX_WIG; +def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG; +def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLS.YMM.RR]>, VEX, VEX_L, VEX_WIG; +} + +// For Disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLS.XMM.RR]>, + VEX, VEX_WIG, FoldGenData<"VMOVDQArr">; +def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLS.YMM.RR]>, + VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQAYrr">; +def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLS.XMM.RR]>, + VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">; +def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLS.YMM.RR]>, + VEX, VEX_L, VEX_WIG, FoldGenData<"VMOVDQUYrr">; +} + +let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, + hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { +def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (alignedloadv2i64 addr:$src))]>, + Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; +def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + 
"movdqa\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLS.YMM.RM]>, + VEX, VEX_L, VEX_WIG; +def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (loadv2i64 addr:$src))]>, + Sched<[SchedWriteVecMoveLS.XMM.RM]>, + XS, VEX, VEX_WIG; +def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLS.YMM.RM]>, + XS, VEX, VEX_L, VEX_WIG; +} + +let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { +def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [(alignedstore (v2i64 VR128:$src), addr:$dst)]>, + Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_WIG; +def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLS.YMM.MR]>, VEX, VEX_L, VEX_WIG; +def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}", + [(store (v2i64 VR128:$src), addr:$dst)]>, + Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, VEX_WIG; +def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, + Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, VEX_WIG; +} + +let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in { +let hasSideEffects = 0 in { +def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>; + +def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, + XS, Requires<[UseSSE2]>; +} + +// For Disassembler +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { +def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, + FoldGenData<"MOVDQArr">; + +def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, + XS, Requires<[UseSSE2]>, FoldGenData<"MOVDQUrr">; +} +} // SchedRW + +let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, + hasSideEffects = 0, SchedRW = [SchedWriteVecMoveLS.XMM.RM] in { +def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>; +def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, + XS, Requires<[UseSSE2]>; +} + +let mayStore = 1, hasSideEffects = 0, + SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { +def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", + [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>; +def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", + [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, + XS, Requires<[UseSSE2]>; +} + +} // ExeDomain = SSEPackedInt + +// Aliases to help the assembler pick two byte VEX encodings by swapping the +// operands relative to the normal instructions to use VEX.R instead of VEX.B. 
+def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}", + (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>; +def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}", + (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>; +def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}", + (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>; +def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}", + (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>; + +// Reversed version with ".s" suffix for GAS compatibility. +def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", + (VMOVDQArr_REV VR128:$dst, VR128:$src), 0>; +def : InstAlias<"vmovdqa.s\t{$src, $dst|$dst, $src}", + (VMOVDQAYrr_REV VR256:$dst, VR256:$src), 0>; +def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", + (VMOVDQUrr_REV VR128:$dst, VR128:$src), 0>; +def : InstAlias<"vmovdqu.s\t{$src, $dst|$dst, $src}", + (VMOVDQUYrr_REV VR256:$dst, VR256:$src), 0>; + +// Reversed version with ".s" suffix for GAS compatibility. +def : InstAlias<"movdqa.s\t{$src, $dst|$dst, $src}", + (MOVDQArr_REV VR128:$dst, VR128:$src), 0>; +def : InstAlias<"movdqu.s\t{$src, $dst|$dst, $src}", + (MOVDQUrr_REV VR128:$dst, VR128:$src), 0>; + +let Predicates = [HasAVX, NoVLX] in { + // Additional patterns for other integer sizes. + def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst), + (VMOVDQAmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst), + (VMOVDQAmr addr:$dst, VR128:$src)>; + def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst), + (VMOVDQAmr addr:$dst, VR128:$src)>; + def : Pat<(store (v4i32 VR128:$src), addr:$dst), + (VMOVDQUmr addr:$dst, VR128:$src)>; + def : Pat<(store (v8i16 VR128:$src), addr:$dst), + (VMOVDQUmr addr:$dst, VR128:$src)>; + def : Pat<(store (v16i8 VR128:$src), addr:$dst), + (VMOVDQUmr addr:$dst, VR128:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Arithmetic Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { // SSE integer instructions + +/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types +multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType DstVT, ValueType SrcVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + X86FoldableSchedWrite sched, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>, + Sched<[sched]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), + (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} +} // ExeDomain = SSEPackedInt + +defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, + SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; +defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, + SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; +defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, + SchedWriteVecALU, 1, NoVLX>; +defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, + SchedWriteVecALU, 1, NoVLX>; +defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8, + 
SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; +defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16, + SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; +defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8, + SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; +defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16, + SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; +defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, + SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; +defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, + SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; +defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, + SchedWriteVecIMul, 1, NoVLX_Or_NoBWI>; +defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, + SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; +defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, + SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; +defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, + SchedWriteVecALU, 0, NoVLX>; +defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, + SchedWriteVecALU, 0, NoVLX>; +defm PSUBSB : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8, + SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; +defm PSUBSW : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16, + SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; +defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, + SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; +defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, + SchedWriteVecALU, 0, NoVLX_Or_NoBWI>; +defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, + SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; +defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, + SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; +defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, + SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; +defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, + SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; +defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, + SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; +defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, + SchedWriteVecALU, 1, NoVLX_Or_NoBWI>; +defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, + SchedWriteVecIMul, 1, NoVLX>; + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in +defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, + loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>, + VEX_4V, VEX_WIG; + +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in +defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, + VR256, loadv4i64, i256mem, SchedWriteVecIMul.YMM, + 0>, VEX_4V, VEX_L, VEX_WIG; +let Constraints = "$src1 = $dst" in +defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, + memopv2i64, i128mem, SchedWriteVecIMul.XMM>; + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in +defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, + loadv2i64, i128mem, SchedWritePSADBW.XMM, 0>, + VEX_4V, VEX_WIG; +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in +defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, + loadv4i64, i256mem, SchedWritePSADBW.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; +let Constraints = "$src1 = $dst" in +defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, + memopv2i64, i128mem, SchedWritePSADBW.XMM>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Logical Instructions 
+//===---------------------------------------------------------------------===// + +multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, SDNode OpNode, + SDNode OpNode2, RegisterClass RC, + X86FoldableSchedWrite sched, + X86FoldableSchedWrite schedImm, + ValueType DstVT, ValueType SrcVT, + PatFrag ld_frag, bit Is2Addr = 1> { + // src2 is always 128-bit + def rr : PDI<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))]>, + Sched<[sched]>; + def rm : PDI<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode RC:$src1, + (SrcVT (bitconvert (ld_frag addr:$src2))))))]>, + Sched<[sched.Folded, ReadAfterLd]>; + def ri : PDIi8<opc2, ImmForm, (outs RC:$dst), + (ins RC:$src1, u8imm:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))]>, + Sched<[schedImm]>; +} + +multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, + string OpcodeStr, SDNode OpNode, + SDNode OpNode2, ValueType DstVT128, + ValueType DstVT256, ValueType SrcVT, + X86SchedWriteWidths sched, + X86SchedWriteWidths schedImm, Predicate prd> { +let Predicates = [HasAVX, prd] in + defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), + OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, + DstVT128, SrcVT, loadv2i64, 0>, VEX_4V, VEX_WIG; +let Predicates = [HasAVX2, prd] in + defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), + OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, + DstVT256, SrcVT, loadv2i64, 0>, VEX_4V, VEX_L, + VEX_WIG; +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, + VR128, sched.XMM, schedImm.XMM, DstVT128, SrcVT, + memopv2i64>; +} + +multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr, + SDNode OpNode, RegisterClass RC, ValueType VT, + X86FoldableSchedWrite sched, bit Is2Addr = 1> { + def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))]>, + Sched<[sched]>; +} + +multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr, + SDNode OpNode, X86SchedWriteWidths sched> { +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in + defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, + VR128, v16i8, sched.XMM, 0>, VEX_4V, VEX_WIG; +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in + defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, + VR256, v32i8, sched.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; +let Constraints = "$src1 = $dst" in + defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8, + sched.XMM>; +} + +let ExeDomain = SSEPackedInt in { + defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, + v8i16, v16i16, v8i16, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; + defm PSLLD : 
PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli, + v4i32, v8i32, v4i32, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX>; + defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli, + v2i64, v4i64, v2i64, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX>; + + defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli, + v8i16, v16i16, v8i16, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; + defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli, + v4i32, v8i32, v4i32, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX>; + defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli, + v2i64, v4i64, v2i64, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX>; + + defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai, + v8i16, v16i16, v8i16, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX_Or_NoBWI>; + defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai, + v4i32, v8i32, v4i32, SchedWriteVecShift, + SchedWriteVecShiftImm, NoVLX>; + + defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq, + SchedWriteShuffle>; + defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq, + SchedWriteShuffle>; +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Comparison Instructions +//===---------------------------------------------------------------------===// + +defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, + SchedWriteVecALU, 1, TruePredicate>; +defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, + SchedWriteVecALU, 1, TruePredicate>; +defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, + SchedWriteVecALU, 1, TruePredicate>; +defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, + SchedWriteVecALU, 0, TruePredicate>; +defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, + SchedWriteVecALU, 0, TruePredicate>; +defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, + SchedWriteVecALU, 0, TruePredicate>; + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Shuffle Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, + SDNode OpNode, X86SchedWriteWidths sched, + Predicate prd> { +let Predicates = [HasAVX, prd] in { + def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>, + VEX, Sched<[sched.XMM]>, VEX_WIG; + def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, u8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)), + (i8 imm:$src2))))]>, VEX, + Sched<[sched.XMM.Folded]>, VEX_WIG; +} + +let Predicates = [HasAVX2, prd] in { + def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, u8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))]>, + VEX, VEX_L, Sched<[sched.YMM]>, VEX_WIG; + def V#NAME#Ymi : 
Ii8<0x70, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src1, u8imm:$src2), + !strconcat("v", OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)), + (i8 imm:$src2))))]>, VEX, VEX_L, + Sched<[sched.YMM.Folded]>, VEX_WIG; +} + +let Predicates = [UseSSE2] in { + def ri : Ii8<0x70, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))]>, + Sched<[sched.XMM]>; + def mi : Ii8<0x70, MRMSrcMem, + (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), + (i8 imm:$src2))))]>, + Sched<[sched.XMM.Folded]>; +} +} +} // ExeDomain = SSEPackedInt + +defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, + SchedWriteShuffle, NoVLX>, PD; +defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, + SchedWriteShuffle, NoVLX_Or_NoBWI>, XS; +defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, + SchedWriteShuffle, NoVLX_Or_NoBWI>, XD; + +//===---------------------------------------------------------------------===// +// Packed Integer Pack Instructions (SSE & AVX) +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, RegisterClass RC, + X86MemOperand x86memop, X86FoldableSchedWrite sched, + PatFrag ld_frag, bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, + Sched<[sched]>; + def rm : PDI<opc, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OutVT (OpNode (ArgVT RC:$src1), + (bitconvert (ld_frag addr:$src2)))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, + ValueType ArgVT, SDNode OpNode, RegisterClass RC, + X86MemOperand x86memop, X86FoldableSchedWrite sched, + PatFrag ld_frag, bit Is2Addr = 1> { + def rr : SS48I<opc, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))]>, + Sched<[sched]>; + def rm : SS48I<opc, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OutVT (OpNode (ArgVT RC:$src1), + (bitconvert (ld_frag addr:$src2)))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, + i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + VEX_4V, VEX_WIG; + defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, + i128mem, 
SchedWriteShuffle.XMM, loadv2i64, 0>, + VEX_4V, VEX_WIG; + + defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, + i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + VEX_4V, VEX_WIG; + defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, + i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + VEX_4V; +} + +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, + i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, + i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + + defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, + i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, + i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + VEX_4V, VEX_L; +} + +let Constraints = "$src1 = $dst" in { + defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, + i128mem, SchedWriteShuffle.XMM, memopv2i64>; + defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, + i128mem, SchedWriteShuffle.XMM, memopv2i64>; + + defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, + i128mem, SchedWriteShuffle.XMM, memopv2i64>; + + defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, + i128mem, SchedWriteShuffle.XMM, memopv2i64>; +} +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Unpack Instructions +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, + SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, + X86FoldableSchedWrite sched, PatFrag ld_frag, + bit Is2Addr = 1> { + def rr : PDI<opc, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, + Sched<[sched]>; + def rm : PDI<opc, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, + (bitconvert (ld_frag addr:$src2)))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, + i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + VEX_4V, VEX_WIG; + defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, + i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + VEX_4V, VEX_WIG; + defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, + i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + VEX_4V, VEX_WIG; + defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, + i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + VEX_4V, VEX_WIG; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, + i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + VEX_4V, VEX_WIG; + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", 
v2i64, X86Unpckl, VR128, + i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + VEX_4V, VEX_WIG; + defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, + i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + VEX_4V, VEX_WIG; + defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, + i128mem, SchedWriteShuffle.XMM, loadv2i64, 0>, + VEX_4V, VEX_WIG; +} + +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, + i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, + i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, + i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, + i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; +} + +let Predicates = [HasAVX2, NoVLX] in { + defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, + i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, + i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, + i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, + i256mem, SchedWriteShuffle.YMM, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; +} + +let Constraints = "$src1 = $dst" in { + defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128, + i128mem, SchedWriteShuffle.XMM, memopv2i64>; + defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128, + i128mem, SchedWriteShuffle.XMM, memopv2i64>; + defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128, + i128mem, SchedWriteShuffle.XMM, memopv2i64>; + defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128, + i128mem, SchedWriteShuffle.XMM, memopv2i64>; + + defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128, + i128mem, SchedWriteShuffle.XMM, memopv2i64>; + defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128, + i128mem, SchedWriteShuffle.XMM, memopv2i64>; + defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128, + i128mem, SchedWriteShuffle.XMM, memopv2i64>; + defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128, + i128mem, SchedWriteShuffle.XMM, memopv2i64>; +} +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Integer Extract and Insert +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { +multiclass sse2_pinsrw<bit Is2Addr = 1> { + def rr : Ii8<0xC4, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, + GR32orGR64:$src2, u8imm:$src3), + !if(Is2Addr, + "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}", + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, + Sched<[WriteVecInsert]>; + def rm : Ii8<0xC4, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, + i16mem:$src2, u8imm:$src3), + !if(Is2Addr, + "pinsrw\t{$src3, $src2, 
$dst|$dst, $src2, $src3}", + "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (X86pinsrw VR128:$src1, (extloadi16 addr:$src2), + imm:$src3))]>, + Sched<[WriteVecInsertLd, ReadAfterLd]>; +} + +// Extract +let Predicates = [HasAVX, NoBWI] in +def VPEXTRWrr : Ii8<0xC5, MRMSrcReg, + (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), + "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), + imm:$src2))]>, + PD, VEX, Sched<[WriteVecExtract]>; +def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, + (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), + "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), + imm:$src2))]>, + Sched<[WriteVecExtract]>; + +// Insert +let Predicates = [HasAVX, NoBWI] in +defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; + +let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in +defm PINSRW : sse2_pinsrw, PD; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Packed Mask Creation +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt in { + +def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), + (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, + Sched<[WriteVecMOVMSK]>, VEX, VEX_WIG; + +let Predicates = [HasAVX2] in { +def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), + (ins VR256:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>, + Sched<[WriteVecMOVMSKY]>, VEX, VEX_L, VEX_WIG; +} + +def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), + "pmovmskb\t{$src, $dst|$dst, $src}", + [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))]>, + Sched<[WriteVecMOVMSK]>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Conditional Store +//===---------------------------------------------------------------------===// + +let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecMoveLS.XMM.MR] in { +let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in +def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs), + (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>, + VEX, VEX_WIG; +let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in +def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs), + (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>, + VEX, VEX_WIG; + +let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in +def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>; +let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in +def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), + "maskmovdqu\t{$mask, $src|$src, $mask}", + [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>; + +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// SSE2 - Move Doubleword/Quadword +//===---------------------------------------------------------------------===// + 
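// NOTE (editorial illustration, not part of the diffed file): the defs in this
// section cover the MOVD/MOVQ forms that move scalars between general-purpose
// registers and XMM registers. A minimal C sketch of the same operations,
// assuming the SSE2 intrinsics from <emmintrin.h> on a 64-bit target; the
// function names are made up for illustration:
//
//   #include <emmintrin.h>
//   // movd r32 -> xmm: the scalar lands in lane 0, the upper lanes are zeroed.
//   __m128i int_to_vec(int x)       { return _mm_cvtsi32_si128(x); }
//   int     vec_to_int(__m128i v)   { return _mm_cvtsi128_si32(v); }
//   // movq r64 <-> xmm (only available in 64-bit mode).
//   __m128i i64_to_vec(long long x) { return _mm_cvtsi64_si128(x); }
//   long long vec_to_i64(__m128i v) { return _mm_cvtsi128_si64(v); }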
+//===---------------------------------------------------------------------===// +// Move Int Doubleword to Packed Double Int +// +let ExeDomain = SSEPackedInt in { +def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector GR32:$src)))]>, + VEX, Sched<[WriteVecMoveFromGpr]>; +def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, + VEX, Sched<[WriteVecLoad]>; +def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector GR64:$src)))]>, + VEX, Sched<[WriteVecMoveFromGpr]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", []>, + VEX, Sched<[WriteVecLoad]>; +let isCodeGenOnly = 1 in +def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))]>, + VEX, Sched<[WriteVecMoveFromGpr]>; + +def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector GR32:$src)))]>, + Sched<[WriteVecMoveFromGpr]>; +def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>, + Sched<[WriteVecLoad]>; +def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector GR64:$src)))]>, + Sched<[WriteVecMoveFromGpr]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteVecLoad]>; +let isCodeGenOnly = 1 in +def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert GR64:$src))]>, + Sched<[WriteVecMoveFromGpr]>; +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// Move Int Doubleword to Single Scalar +// +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { + def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))]>, + VEX, Sched<[WriteVecMoveFromGpr]>; + + def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, + VEX, Sched<[WriteVecLoad]>; + def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert GR32:$src))]>, + Sched<[WriteVecMoveFromGpr]>; + + def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>, + Sched<[WriteVecLoad]>; +} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 + +//===---------------------------------------------------------------------===// +// Move Packed Doubleword Int to Packed Double Int +// +let ExeDomain = SSEPackedInt in { 
+def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), + (iPTR 0)))]>, VEX, + Sched<[WriteVecMoveToGpr]>; +def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), + (ins i32mem:$dst, VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (extractelt (v4i32 VR128:$src), + (iPTR 0))), addr:$dst)]>, + VEX, Sched<[WriteVecStore]>; +def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), + (iPTR 0)))]>, + Sched<[WriteVecMoveToGpr]>; +def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (extractelt (v4i32 VR128:$src), + (iPTR 0))), addr:$dst)]>, + Sched<[WriteVecStore]>; +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// Move Packed Doubleword Int first element to Doubleword Int +// +let ExeDomain = SSEPackedInt in { +let SchedRW = [WriteVecMoveToGpr] in { +def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), + (iPTR 0)))]>, + VEX; + +def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), + (iPTR 0)))]>; +} //SchedRW + +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs), + (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", []>, + VEX, Sched<[WriteVecStore]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteVecStore]>; +} // ExeDomain = SSEPackedInt + +//===---------------------------------------------------------------------===// +// Bitcast FR64 <-> GR64 +// +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { + let Predicates = [UseAVX] in + def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, + VEX, Sched<[WriteVecLoad]>; + def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))]>, + VEX, Sched<[WriteVecMoveToGpr]>; + def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>, + VEX, Sched<[WriteVecStore]>; + + def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>, + Sched<[WriteVecLoad]>; + def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (bitconvert FR64:$src))]>, + Sched<[WriteVecMoveToGpr]>; + def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>, + Sched<[WriteVecStore]>; +} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 + +//===---------------------------------------------------------------------===// +// Move Scalar Single to 
Double Int +// +let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { + def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))]>, + VEX, Sched<[WriteVecMoveToGpr]>; + def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, + VEX, Sched<[WriteVecStore]>; + def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (bitconvert FR32:$src))]>, + Sched<[WriteVecMoveToGpr]>; + def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src), + "movd\t{$src, $dst|$dst, $src}", + [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>, + Sched<[WriteVecStore]>; +} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 + +let Predicates = [UseAVX] in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), + (VMOVDI2PDIrr GR32:$src)>; + + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (VMOV64toPQIrr GR64:$src)>; + + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (v2i64 (VMOV64toPQIrr GR64:$src)), sub_xmm)>; + // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. + // These instructions also write zeros in the high part of a 256-bit register. + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), + (VMOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (VMOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzload addr:$src)), + (VMOVDI2PDIrm addr:$src)>; + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>; + def : Pat<(v8i32 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>; + // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext. + def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, + (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIrr GR32:$src)), sub_xmm)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), + (MOVDI2PDIrr GR32:$src)>; + + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (MOV64toPQIrr GR64:$src)>; + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))), + (MOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (MOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))), + (MOVDI2PDIrm addr:$src)>; + def : Pat<(v4i32 (X86vzload addr:$src)), + (MOVDI2PDIrm addr:$src)>; +} + +// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of +// "movq" due to MacOS parsing limitation. In order to parse old assembly, we add +// these aliases. +def : InstAlias<"movd\t{$src, $dst|$dst, $src}", + (MOV64toPQIrr VR128:$dst, GR64:$src), 0>; +def : InstAlias<"movd\t{$src, $dst|$dst, $src}", + (MOVPQIto64rr GR64:$dst, VR128:$src), 0>; +// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX. 
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", + (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>; +def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", + (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>; + +//===---------------------------------------------------------------------===// +// SSE2 - Move Quadword +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Move Quadword Int to Packed Quadword Int +// + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in { +def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, + VEX, Requires<[UseAVX]>, VEX_WIG; +def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, + XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix +} // ExeDomain, SchedRW + +//===---------------------------------------------------------------------===// +// Move Packed Quadword Int to Quadword Int +// +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in { +def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (extractelt (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)]>, + VEX, VEX_WIG; +def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(store (i64 (extractelt (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)]>; +} // ExeDomain, SchedRW + +// For disassembler only +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, + SchedRW = [SchedWriteVecLogic.XMM] in { +def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_WIG; +def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", []>; +} + +// Aliases to help the assembler pick two byte VEX encodings by swapping the +// operands relative to the normal instructions to use VEX.R instead of VEX.B. +def : InstAlias<"vmovq\t{$src, $dst|$dst, $src}", + (VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>; + +def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}", + (VMOVPQI2QIrr VR128:$dst, VR128:$src), 0>; +def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}", + (MOVPQI2QIrr VR128:$dst, VR128:$src), 0>; + +let Predicates = [UseAVX] in { + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (VMOVQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), + (VMOVQI2PQIrm addr:$src)>; + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), + (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; + def : Pat<(v4i64 (X86vzload addr:$src)), + (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))), + (MOVQI2PQIrm addr:$src)>; + def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>; +} + +//===---------------------------------------------------------------------===// +// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in +// IA32 document. movq xmm1, xmm2 does clear the high bits. 
+// +let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in { +def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, + XS, VEX, Requires<[UseAVX]>, VEX_WIG; +def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movq\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, + XS, Requires<[UseSSE2]>; +} // ExeDomain, SchedRW + +let Predicates = [UseAVX] in { + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (VMOVZPQILo2PQIrr VR128:$src)>; +} +let Predicates = [UseSSE2] in { + def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))), + (MOVZPQILo2PQIrr VR128:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// + +multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr, + ValueType vt, RegisterClass RC, PatFrag mem_frag, + X86MemOperand x86memop, X86FoldableSchedWrite sched> { +def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (vt (OpNode RC:$src)))]>, + Sched<[sched]>; +def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, + Sched<[sched.Folded]>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", + v4f32, VR128, loadv4f32, f128mem, + SchedWriteFShuffle.XMM>, VEX, VEX_WIG; + defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", + v4f32, VR128, loadv4f32, f128mem, + SchedWriteFShuffle.XMM>, VEX, VEX_WIG; + defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", + v8f32, VR256, loadv8f32, f256mem, + SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG; + defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", + v8f32, VR256, loadv8f32, f256mem, + SchedWriteFShuffle.YMM>, VEX, VEX_L, VEX_WIG; +} +defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, + memopv4f32, f128mem, SchedWriteFShuffle.XMM>; +defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, + memopv4f32, f128mem, SchedWriteFShuffle.XMM>; + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (X86Movshdup VR128:$src)), + (VMOVSHDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVSHDUPrm addr:$src)>; + def : Pat<(v4i32 (X86Movsldup VR128:$src)), + (VMOVSLDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))), + (VMOVSLDUPrm addr:$src)>; + def : Pat<(v8i32 (X86Movshdup VR256:$src)), + (VMOVSHDUPYrr VR256:$src)>; + def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))), + (VMOVSHDUPYrm addr:$src)>; + def : Pat<(v8i32 (X86Movsldup VR256:$src)), + (VMOVSLDUPYrr VR256:$src)>; + def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))), + (VMOVSLDUPYrm addr:$src)>; +} + +let Predicates = [UseSSE3] in { + def : Pat<(v4i32 (X86Movshdup VR128:$src)), + (MOVSHDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))), + (MOVSHDUPrm addr:$src)>; + def : Pat<(v4i32 (X86Movsldup VR128:$src)), + (MOVSLDUPrr VR128:$src)>; + def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))), + (MOVSLDUPrm 
addr:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Replicate Double FP - MOVDDUP +//===---------------------------------------------------------------------===// + +multiclass sse3_replicate_dfp<string OpcodeStr, X86SchedWriteWidths sched> { +def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))]>, + Sched<[sched.XMM]>; +def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (v2f64 (X86Movddup + (scalar_to_vector (loadf64 addr:$src)))))]>, + Sched<[sched.XMM.Folded]>; +} + +// FIXME: Merge with above classes when there are patterns for the ymm version +multiclass sse3_replicate_dfp_y<string OpcodeStr, X86SchedWriteWidths sched> { +def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>, + Sched<[sched.YMM]>; +def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>, + Sched<[sched.YMM.Folded]>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VMOVDDUP : sse3_replicate_dfp<"vmovddup", SchedWriteFShuffle>, + VEX, VEX_WIG; + defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup", SchedWriteFShuffle>, + VEX, VEX_L, VEX_WIG; +} + +defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>; + + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(X86Movddup (loadv2f64 addr:$src)), + (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; +} + +let Predicates = [UseSSE3] in { + // No need for aligned memory as this only loads 64-bits. 
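// NOTE (editorial illustration, not part of the diffed file): MOVDDUP
// duplicates the low double-precision element, and its memory form reads only
// 8 bytes, which is why the pattern below can fold an unaligned 128-bit load.
// A minimal C sketch using the SSE3 intrinsics from <pmmintrin.h>:
//
//   #include <pmmintrin.h>
//   __m128d dup_low(__m128d v)        { return _mm_movedup_pd(v); }
//   // The load form touches only 8 bytes of memory.
//   __m128d dup_load(const double *p) { return _mm_loaddup_pd(p); }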
+ def : Pat<(X86Movddup (loadv2f64 addr:$src)), + (MOVDDUPrm addr:$src)>; +} + +//===---------------------------------------------------------------------===// +// SSE3 - Move Unaligned Integer +//===---------------------------------------------------------------------===// + +let Predicates = [HasAVX] in { + def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, + Sched<[SchedWriteVecMoveLS.XMM.RM]>, VEX, VEX_WIG; + def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vlddqu\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, + Sched<[SchedWriteVecMoveLS.YMM.RM]>, VEX, VEX_L, VEX_WIG; +} // Predicates + +def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "lddqu\t{$src, $dst|$dst, $src}", + [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, + Sched<[SchedWriteVecMoveLS.XMM.RM]>; + +//===---------------------------------------------------------------------===// +// SSE3 - Arithmetic +//===---------------------------------------------------------------------===// + +multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, X86FoldableSchedWrite sched, + PatFrag ld_frag, bit Is2Addr = 1> { + def rr : I<0xD0, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))]>, + Sched<[sched]>; + def rm : I<0xD0, MRMSrcMem, + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + let ExeDomain = SSEPackedSingle in { + defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem, + SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>, + XD, VEX_4V, VEX_WIG; + defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem, + SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>, + XD, VEX_4V, VEX_L, VEX_WIG; + } + let ExeDomain = SSEPackedDouble in { + defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem, + SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>, + PD, VEX_4V, VEX_WIG; + defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem, + SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>, + PD, VEX_4V, VEX_L, VEX_WIG; + } +} +let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { + let ExeDomain = SSEPackedSingle in + defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem, + SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD; + let ExeDomain = SSEPackedDouble in + defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem, + SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD; +} + +//===---------------------------------------------------------------------===// +// SSE3 Instructions +//===---------------------------------------------------------------------===// + +// Horizontal ops +multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, + X86FoldableSchedWrite sched, PatFrag ld_frag, + bit Is2Addr = 1> { + def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, 
$src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, + Sched<[sched]>; + + def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} +multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, + X86MemOperand x86memop, SDNode OpNode, + X86FoldableSchedWrite sched, PatFrag ld_frag, + bit Is2Addr = 1> { + def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>, + Sched<[sched]>; + + def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { + let ExeDomain = SSEPackedSingle in { + defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, + X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; + defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, + X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, VEX_WIG; + defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, + X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; + defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, + X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; + } + let ExeDomain = SSEPackedDouble in { + defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem, + X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; + defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem, + X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, VEX_WIG; + defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem, + X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; + defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem, + X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; + } +} + +let Constraints = "$src1 = $dst" in { + let ExeDomain = SSEPackedSingle in { + defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd, + WriteFHAdd, memopv4f32>; + defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub, + WriteFHAdd, memopv4f32>; + } + let ExeDomain = SSEPackedDouble in { + defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd, + WriteFHAdd, memopv2f64>; + defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub, + WriteFHAdd, memopv2f64>; + } +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Absolute Instructions +//===---------------------------------------------------------------------===// + +/// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 
+multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt, + SDNode OpNode, X86SchedWriteWidths sched, PatFrag ld_frag> { + def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (vt (OpNode VR128:$src)))]>, + Sched<[sched.XMM]>; + + def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (vt (OpNode (bitconvert (ld_frag addr:$src)))))]>, + Sched<[sched.XMM.Folded]>; +} + +/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. +multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt, + SDNode OpNode, X86SchedWriteWidths sched> { + def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (vt (OpNode VR256:$src)))]>, + Sched<[sched.YMM]>; + + def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), + (ins i256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>, + Sched<[sched.YMM.Folded]>; +} + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SchedWriteVecALU, + loadv2i64>, VEX, VEX_WIG; + defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SchedWriteVecALU, + loadv2i64>, VEX, VEX_WIG; +} +let Predicates = [HasAVX, NoVLX] in { + defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SchedWriteVecALU, + loadv2i64>, VEX, VEX_WIG; +} +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SchedWriteVecALU>, + VEX, VEX_L, VEX_WIG; + defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SchedWriteVecALU>, + VEX, VEX_L, VEX_WIG; +} +let Predicates = [HasAVX2, NoVLX] in { + defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SchedWriteVecALU>, + VEX, VEX_L, VEX_WIG; +} + +defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SchedWriteVecALU, + memopv2i64>; +defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SchedWriteVecALU, + memopv2i64>; +defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SchedWriteVecALU, + memopv2i64>; + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Binary Operator Instructions +//===---------------------------------------------------------------------===// + +/// SS3I_binop_rm - Simple SSSE3 bin op +multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType DstVT, ValueType OpVT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + X86FoldableSchedWrite sched, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS38I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))]>, + Sched<[sched]>; + def rm : SS38I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (DstVT (OpNode (OpVT RC:$src1), + (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32}. 
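// NOTE (editorial illustration, not part of the diffed file): the
// intrinsic-based binop multiclasses below are used for the SSSE3 operations
// that are modeled only as intrinsics, such as PSIGN and the saturating
// horizontal adds/subtracts. A minimal C sketch, assuming <tmmintrin.h>:
//
//   #include <tmmintrin.h>
//   // psignw: per lane, negate, zero or pass through a according to b's sign.
//   __m128i sign_words(__m128i a, __m128i b) { return _mm_sign_epi16(a, b); }
//   // phaddsw: add adjacent word pairs with signed saturation.
//   __m128i hadd_sat(__m128i a, __m128i b)   { return _mm_hadds_epi16(a, b); }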
+multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId128, X86FoldableSchedWrite sched, + PatFrag ld_frag, bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + Sched<[sched]>; + def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set VR128:$dst, + (IntId128 VR128:$src1, + (bitconvert (ld_frag addr:$src2))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr, + Intrinsic IntId256, + X86FoldableSchedWrite sched> { + let isCommutable = 1 in + def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, + Sched<[sched]>; + def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (IntId256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in { +let isCommutable = 0 in { + defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8, + VR128, loadv2i64, i128mem, + SchedWriteVarShuffle.XMM, 0>, VEX_4V, VEX_WIG; + defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16, + v16i8, VR128, loadv2i64, i128mem, + SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; +} +defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16, + VR128, loadv2i64, i128mem, + SchedWriteVecIMul.XMM, 0>, VEX_4V, VEX_WIG; +} + +let ImmT = NoImm, Predicates = [HasAVX] in { +let isCommutable = 0 in { + defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128, + loadv2i64, i128mem, + SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; + defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128, + loadv2i64, i128mem, + SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; + defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128, + loadv2i64, i128mem, + SchedWritePHAdd.XMM, 0>, VEX_4V, VEX_WIG; + defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, + loadv2i64, i128mem, + SchedWritePHAdd.XMM, 0>, VEX_4V; + defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", + int_x86_ssse3_psign_b_128, + SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", + int_x86_ssse3_psign_w_128, + SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", + int_x86_ssse3_psign_d_128, + SchedWriteVecALU.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", + int_x86_ssse3_phadd_sw_128, + SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; + defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", + int_x86_ssse3_phsub_sw_128, + SchedWritePHAdd.XMM, loadv2i64, 0>, VEX_4V, VEX_WIG; +} +} + +let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { +let isCommutable = 0 in { + defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, 
v32i8, v32i8, + VR256, loadv4i64, i256mem, + SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; + defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16, + v32i8, VR256, loadv4i64, i256mem, + SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; +} +defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16, + VR256, loadv4i64, i256mem, + SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; +} + +let ImmT = NoImm, Predicates = [HasAVX2] in { +let isCommutable = 0 in { + defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, + VR256, loadv4i64, i256mem, + SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; + defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, + loadv4i64, i256mem, + SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; + defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, + VR256, loadv4i64, i256mem, + SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; + defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, + loadv4i64, i256mem, + SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L; + defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, + SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; + defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w, + SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; + defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d, + SchedWriteVecALU.YMM>, VEX_4V, VEX_L, VEX_WIG; + defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", + int_x86_avx2_phadd_sw, + SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; + defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", + int_x86_avx2_phsub_sw, + SchedWritePHAdd.YMM>, VEX_4V, VEX_L, VEX_WIG; +} +} + +// None of these have i8 immediate fields. 
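// NOTE (editorial illustration, not part of the diffed file): PSHUFB, defined
// in VEX form above and in its plain SSE form in the block that follows, is a
// per-byte table lookup; the 256-bit VPSHUFB shuffles independently within
// each 128-bit half. A minimal C sketch, assuming <immintrin.h> and AVX2:
//
//   #include <immintrin.h>
//   // pshufb: idx's low 4 bits select a source byte; a set msb writes zero.
//   __m128i lookup128(__m128i a, __m128i idx) { return _mm_shuffle_epi8(a, idx); }
//   // vpshufb ymm: the same lookup, done separately in each 128-bit lane.
//   __m256i lookup256(__m256i a, __m256i idx) { return _mm256_shuffle_epi8(a, idx); }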
+let ImmT = NoImm, Constraints = "$src1 = $dst" in { +let isCommutable = 0 in { + defm PHADDW : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128, + memopv2i64, i128mem, SchedWritePHAdd.XMM>; + defm PHADDD : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128, + memopv2i64, i128mem, SchedWritePHAdd.XMM>; + defm PHSUBW : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128, + memopv2i64, i128mem, SchedWritePHAdd.XMM>; + defm PHSUBD : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128, + memopv2i64, i128mem, SchedWritePHAdd.XMM>; + defm PSIGNB : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128, + SchedWriteVecALU.XMM, memopv2i64>; + defm PSIGNW : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128, + SchedWriteVecALU.XMM, memopv2i64>; + defm PSIGND : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128, + SchedWriteVecALU.XMM, memopv2i64>; + defm PSHUFB : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128, + memopv2i64, i128mem, SchedWriteVarShuffle.XMM>; + defm PHADDSW : SS3I_binop_rm_int<0x03, "phaddsw", + int_x86_ssse3_phadd_sw_128, + SchedWritePHAdd.XMM, memopv2i64>; + defm PHSUBSW : SS3I_binop_rm_int<0x07, "phsubsw", + int_x86_ssse3_phsub_sw_128, + SchedWritePHAdd.XMM, memopv2i64>; + defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16, + v16i8, VR128, memopv2i64, i128mem, + SchedWriteVecIMul.XMM>; +} +defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16, + VR128, memopv2i64, i128mem, SchedWriteVecIMul.XMM>; +} + +//===---------------------------------------------------------------------===// +// SSSE3 - Packed Align Instruction Patterns +//===---------------------------------------------------------------------===// + +multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + X86FoldableSchedWrite sched, bit Is2Addr = 1> { + let hasSideEffects = 0 in { + def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))]>, + Sched<[sched]>; + let mayLoad = 1 in + def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (VT (X86PAlignr RC:$src1, + (bitconvert (memop_frag addr:$src2)), + (i8 imm:$src3))))]>, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in + defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, i128mem, + SchedWriteShuffle.XMM, 0>, VEX_4V, VEX_WIG; +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in + defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, i256mem, + SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, VEX_WIG; +let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in + defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, i128mem, + SchedWriteShuffle.XMM>; + +//===---------------------------------------------------------------------===// +// SSSE3 - Thread synchronization +//===---------------------------------------------------------------------===// + +let SchedRW = [WriteSystem] in { +/* +let usesCustomInserter = 1 in { +def MONITOR : PseudoI<(outs), (ins 
i32mem:$src1, GR32:$src2, GR32:$src3), + [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>, + Requires<[HasSSE3]>; +} +*/ + +let Uses = [EAX, ECX, EDX] in +def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", []>, + TB, Requires<[HasSSE3]>; + +let Uses = [ECX, EAX] in +def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait", + [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>; +} // SchedRW + +def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[Not64BitMode]>; +def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>; + +def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>, + Requires<[Not64BitMode]>; +def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>, + Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Packed Move with Sign/Zero Extend +//===----------------------------------------------------------------------===// + +multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, + RegisterClass OutRC, RegisterClass InRC, + X86FoldableSchedWrite sched> { + def rr : SS48I<opc, MRMSrcReg, (outs OutRC:$dst), (ins InRC:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, + Sched<[sched]>; + + def rm : SS48I<opc, MRMSrcMem, (outs OutRC:$dst), (ins MemOp:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>, + Sched<[sched.Folded]>; +} + +multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr, + X86MemOperand MemOp, X86MemOperand MemYOp, + Predicate prd> { + defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, + SchedWriteShuffle.XMM>; + let Predicates = [HasAVX, prd] in + defm V#NAME : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp, + VR128, VR128, SchedWriteShuffle.XMM>, + VEX, VEX_WIG; + let Predicates = [HasAVX2, prd] in + defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp, + VR256, VR128, WriteShuffle256>, + VEX, VEX_L, VEX_WIG; +} + +multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp, + X86MemOperand MemYOp, Predicate prd> { + defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc, !strconcat("pmovsx", OpcodeStr), + MemOp, MemYOp, prd>; + defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10), + !strconcat("pmovzx", OpcodeStr), + MemOp, MemYOp, prd>; +} + +defm BW : SS41I_pmovx_rm<0x20, "bw", i64mem, i128mem, NoVLX_Or_NoBWI>; +defm WD : SS41I_pmovx_rm<0x23, "wd", i64mem, i128mem, NoVLX>; +defm DQ : SS41I_pmovx_rm<0x25, "dq", i64mem, i128mem, NoVLX>; + +defm BD : SS41I_pmovx_rm<0x21, "bd", i32mem, i64mem, NoVLX>; +defm WQ : SS41I_pmovx_rm<0x24, "wq", i32mem, i64mem, NoVLX>; + +defm BQ : SS41I_pmovx_rm<0x22, "bq", i16mem, i32mem, NoVLX>; + +// AVX2 Patterns +multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> { + // Register-Register patterns + let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>; + } + let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>; + + def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>; + + def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))), + (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>; + 
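  // NOTE (editorial illustration, not part of the diffed file): the
  // register-register patterns above map sign/zero extension of an XMM source
  // onto the 256-bit VPMOVSX/VPMOVZX forms. A minimal C sketch, assuming the
  // AVX2 intrinsics from <immintrin.h>:
  //
  //   #include <immintrin.h>
  //   // vpmovsxbw ymm, xmm: sign-extend 16 bytes to 16 words.
  //   __m256i widen_signed(__m128i v)   { return _mm256_cvtepi8_epi16(v); }
  //   // vpmovzxbd ymm, xmm: zero-extend the low 8 bytes to 8 dwords.
  //   __m256i widen_unsigned(__m128i v) { return _mm256_cvtepu8_epi32(v); }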
} + + // Simple Register-Memory patterns + let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + } + let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + + def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + + def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + } + + // AVX2 Register-Memory patterns + let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWYrm) addr:$src)>; + } + let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQYrm) addr:$src)>; + + def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQYrm) addr:$src)>; + + def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQYrm) addr:$src)>; + } +} + +defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>; +defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>; + +// SSE4.1/AVX patterns. 
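// NOTE (editorial illustration, not part of the diffed file): the pattern
// multiclass below selects the 128-bit PMOVSX/PMOVZX forms for in-register
// widening. A minimal C sketch, assuming the SSE4.1 intrinsics from
// <smmintrin.h>:
//
//   #include <smmintrin.h>
//   // pmovsxbw: sign-extend the low 8 bytes to 8 words.
//   __m128i widen_s8_to_s16(__m128i v) { return _mm_cvtepi8_epi16(v); }
//   // pmovzxbw: zero-extend the low 8 bytes to 8 words.
//   __m128i widen_u8_to_u16(__m128i v) { return _mm_cvtepu8_epi16(v); }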
+multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy, + SDNode ExtOp> { + let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BWrr) VR128:$src)>; + } + let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BDrr) VR128:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))), + (!cast<I>(OpcPrefix#BQrr) VR128:$src)>; + + def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WDrr) VR128:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))), + (!cast<I>(OpcPrefix#WQrr) VR128:$src)>; + + def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))), + (!cast<I>(OpcPrefix#DQrr) VR128:$src)>; + } + let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + } + let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + + def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + + def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + } + let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BWrm) addr:$src)>; + } + let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BDrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#BQrm) addr:$src)>; + + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WDrm) addr:$src)>; + def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + 
(!cast<I>(OpcPrefix#WDrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#WQrm) addr:$src)>; + + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))), + (!cast<I>(OpcPrefix#DQrm) addr:$src)>; + } +} + +defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec>; +defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec>; + +let Predicates = [UseSSE41] in { + defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec>; + defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec>; +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Extract Instructions +//===----------------------------------------------------------------------===// + +/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem +multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1), + imm:$src2))]>, + Sched<[WriteVecExtract]>; + let hasSideEffects = 0, mayStore = 1 in + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i8mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), + addr:$dst)]>, Sched<[WriteVecExtractSt]>; +} + +let Predicates = [HasAVX, NoBWI] in + defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; + +defm PEXTRB : SS41I_extract8<0x14, "pextrb">; + + +/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination +multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + Sched<[WriteVecExtract]>, FoldGenData<NAME#rr>; + + let hasSideEffects = 0, mayStore = 1 in + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i16mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), + addr:$dst)]>, Sched<[WriteVecExtractSt]>; +} + +let Predicates = [HasAVX, NoBWI] in + defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; + +defm PEXTRW : SS41I_extract16<0x15, "pextrw">; + + +/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination +multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, 
$src1, $dst|$dst, $src1, $src2}"), + [(set GR32:$dst, + (extractelt (v4i32 VR128:$src1), imm:$src2))]>, + Sched<[WriteVecExtract]>; + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i32mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (v4i32 VR128:$src1), imm:$src2), + addr:$dst)]>, Sched<[WriteVecExtractSt]>; +} + +let Predicates = [HasAVX, NoDQI] in + defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; + +defm PEXTRD : SS41I_extract32<0x16, "pextrd">; + +/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination +multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR64:$dst, + (extractelt (v2i64 VR128:$src1), imm:$src2))]>, + Sched<[WriteVecExtract]>; + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins i64mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (v2i64 VR128:$src1), imm:$src2), + addr:$dst)]>, Sched<[WriteVecExtractSt]>; +} + +let Predicates = [HasAVX, NoDQI] in + defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; + +defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W; + +/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory +/// destination +multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { + def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set GR32orGR64:$dst, + (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>, + Sched<[WriteVecExtract]>; + def mr : SS4AIi8<opc, MRMDestMem, (outs), + (ins f32mem:$dst, VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2), + addr:$dst)]>, Sched<[WriteVecExtractSt]>; +} + +let ExeDomain = SSEPackedSingle in { + let Predicates = [UseAVX] in + defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG; + defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; +} + +// Also match an EXTRACTPS store when the store is done as f32 instead of i32. 
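// NOTE (editorial illustration, not part of the diffed file): the extract
// forms defined above move a selected lane into a general-purpose register or
// to memory. A minimal C sketch, assuming the SSE4.1 intrinsics from
// <smmintrin.h>; the lane immediates must be compile-time constants:
//
//   #include <smmintrin.h>
//   int get_byte2(__m128i v)     { return _mm_extract_epi8(v, 2);  } // pextrb
//   int get_dword1(__m128i v)    { return _mm_extract_epi32(v, 1); } // pextrd
//   // extractps returns the raw bit pattern of the selected float lane.
//   int get_lane3_bits(__m128 v) { return _mm_extract_ps(v, 3); }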
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), + imm:$src2))), + addr:$dst), + (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, + Requires<[HasAVX]>; +def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), + imm:$src2))), + addr:$dst), + (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>, + Requires<[UseSSE41]>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Insert Instructions +//===----------------------------------------------------------------------===// + +multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, + Sched<[WriteVecInsert]>; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i8mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86pinsrb VR128:$src1, (extloadi8 addr:$src2), + imm:$src3))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoBWI] in + defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PINSRB : SS41I_insert8<0x20, "pinsrb">; + +multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR32:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>, + Sched<[WriteVecInsert]>; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i32mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2), + imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoDQI] in + defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; +let Constraints = "$src1 = $dst" in + defm PINSRD : SS41I_insert32<0x22, "pinsrd">; + +multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, GR64:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>, + Sched<[WriteVecInsert]>; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i64mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2), + imm:$src3)))]>, Sched<[WriteVecInsertLd, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoDQI] in + defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, 
VEX_W; +let Constraints = "$src1 = $dst" in + defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; + +// insertps has a few different modes, there's the first two here below which +// are optimized inserts that won't zero arbitrary elements in the destination +// vector. The next one matches the intrinsic and could zero arbitrary elements +// in the target vector. +multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { + def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86insertps VR128:$src1, VR128:$src2, imm:$src3))]>, + Sched<[SchedWriteFShuffle.XMM]>; + def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f32mem:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(asm, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (X86insertps VR128:$src1, + (v4f32 (scalar_to_vector (loadf32 addr:$src2))), + imm:$src3))]>, + Sched<[SchedWriteFShuffle.XMM.Folded, ReadAfterLd]>; +} + +let ExeDomain = SSEPackedSingle in { + let Predicates = [UseAVX] in + defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, + VEX_4V, VEX_WIG; + let Constraints = "$src1 = $dst" in + defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>; +} + +let Predicates = [UseAVX] in { + // If we're inserting an element from a vbroadcast of a load, fold the + // load into the X86insertps instruction. + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), + (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)), + (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; + def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), + (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)), + (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Round Instructions +//===----------------------------------------------------------------------===// + +multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + ValueType VT, PatFrag mem_frag, SDNode OpNode, + X86FoldableSchedWrite sched> { + // Intrinsic operation, reg. 
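// For reference, the i32u8imm operand here is the SSE4.1 rounding-control
// byte: bits 1:0 select the mode (00 nearest, 01 toward -inf, 10 toward +inf,
// 11 toward zero), bit 2 (0x4) means "use MXCSR.RC instead", and bit 3 (0x8)
// suppresses the precision exception. That is why the patterns further down
// use 0x9 for ffloor, 0xA for fceil, 0xB for ftrunc, 0xC for fnearbyint and
// 0x4 for frint. A minimal C sketch, assuming <immintrin.h> and -msse4.1
// (helper name is hypothetical):
/*
  #include <immintrin.h>

  __m128 floor_noexc(__m128 x) {
    // 0x9 == _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC, matching the
    // (i32 0x9) immediate used by the ffloor patterns.
    return _mm_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
  }
*/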
+ // Vector intrinsic operation, reg + def r : SS4AIi8<opc, MRMSrcReg, + (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (VT (OpNode RC:$src1, imm:$src2)))]>, + Sched<[sched]>; + + // Vector intrinsic operation, mem + def m : SS4AIi8<opc, MRMSrcMem, + (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (VT (OpNode (mem_frag addr:$src1),imm:$src2)))]>, + Sched<[sched.Folded]>; +} + +multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd, + string OpcodeStr, X86FoldableSchedWrite sched> { +let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { + def SSr : SS4AIi8<opcss, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, Sched<[sched]>; + + let mayLoad = 1 in + def SSm : SS4AIi8<opcss, MRMSrcMem, + (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, Sched<[sched.Folded, ReadAfterLd]>; +} // ExeDomain = SSEPackedSingle, hasSideEffects = 0 + +let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { + def SDr : SS4AIi8<opcsd, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, Sched<[sched]>; + + let mayLoad = 1 in + def SDm : SS4AIi8<opcsd, MRMSrcMem, + (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, Sched<[sched.Folded, ReadAfterLd]>; +} // ExeDomain = SSEPackedDouble, hasSideEffects = 0 +} + +multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd, + string OpcodeStr, X86FoldableSchedWrite sched> { +let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { + def SSr : SS4AIi8<opcss, MRMSrcReg, + (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[sched]>; + + let mayLoad = 1 in + def SSm : SS4AIi8<opcss, MRMSrcMem, + (outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[sched.Folded, ReadAfterLd]>; +} // ExeDomain = SSEPackedSingle, hasSideEffects = 0 + +let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { + def SDr : SS4AIi8<opcsd, MRMSrcReg, + (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[sched]>; + + let mayLoad = 1 in + def SDm : SS4AIi8<opcsd, MRMSrcMem, + (outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2), + !strconcat(OpcodeStr, + "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, Sched<[sched.Folded, ReadAfterLd]>; +} // ExeDomain = SSEPackedDouble, hasSideEffects = 0 +} + +multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd, + string OpcodeStr, X86FoldableSchedWrite sched, + ValueType VT32, ValueType VT64, + SDNode OpNode, bit Is2Addr = 1> { +let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in { + def SSr_Int : SS4AIi8<opcss, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, 
$src3}")), + [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, + Sched<[sched]>; + + def SSm_Int : SS4AIi8<opcss, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 + +let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in { + def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, + (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, + Sched<[sched]>; + + def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, + (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set VR128:$dst, + (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 +} + +// FP round - roundss, roundps, roundsd, roundpd +let Predicates = [HasAVX, NoVLX] in { + let ExeDomain = SSEPackedSingle in { + // Intrinsic form + defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32, + loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>, + VEX, VEX_WIG; + defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32, + loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>, + VEX, VEX_L, VEX_WIG; + } + + let ExeDomain = SSEPackedDouble in { + defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64, + loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>, + VEX, VEX_WIG; + defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64, + loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>, + VEX, VEX_L, VEX_WIG; + } +} +let Predicates = [HasAVX, NoAVX512] in { + defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, + v4f32, v2f64, X86RndScales, 0>, + VEX_4V, VEX_LIG, VEX_WIG; + defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>, + VEX_4V, VEX_LIG, VEX_WIG; +} + +let Predicates = [UseAVX] in { + def : Pat<(ffloor FR32:$src), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; + def : Pat<(f32 (frint FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; + + def : Pat<(f64 (ffloor FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; + def : Pat<(f64 (fceil FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; + def : Pat<(f64 (frint FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; +} + +let Predicates = 
[UseAVX, OptForSize] in { + def : Pat<(ffloor (loadf32 addr:$src)), + (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>; + def : Pat<(f32 (fnearbyint (loadf32 addr:$src))), + (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>; + def : Pat<(f32 (fceil (loadf32 addr:$src))), + (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>; + def : Pat<(f32 (frint (loadf32 addr:$src))), + (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc (loadf32 addr:$src))), + (VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>; + + def : Pat<(f64 (ffloor (loadf64 addr:$src))), + (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x9))>; + def : Pat<(f64 (fnearbyint (loadf64 addr:$src))), + (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xC))>; + def : Pat<(f64 (fceil (loadf64 addr:$src))), + (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xA))>; + def : Pat<(f64 (frint (loadf64 addr:$src))), + (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0x4))>; + def : Pat<(f64 (ftrunc (loadf64 addr:$src))), + (VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src, (i32 0xB))>; +} + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4f32 (ffloor VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x9))>; + def : Pat<(v4f32 (fnearbyint VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0xC))>; + def : Pat<(v4f32 (fceil VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0xA))>; + def : Pat<(v4f32 (frint VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0x4))>; + def : Pat<(v4f32 (ftrunc VR128:$src)), + (VROUNDPSr VR128:$src, (i32 0xB))>; + + def : Pat<(v4f32 (ffloor (loadv4f32 addr:$src))), + (VROUNDPSm addr:$src, (i32 0x9))>; + def : Pat<(v4f32 (fnearbyint (loadv4f32 addr:$src))), + (VROUNDPSm addr:$src, (i32 0xC))>; + def : Pat<(v4f32 (fceil (loadv4f32 addr:$src))), + (VROUNDPSm addr:$src, (i32 0xA))>; + def : Pat<(v4f32 (frint (loadv4f32 addr:$src))), + (VROUNDPSm addr:$src, (i32 0x4))>; + def : Pat<(v4f32 (ftrunc (loadv4f32 addr:$src))), + (VROUNDPSm addr:$src, (i32 0xB))>; + + def : Pat<(v2f64 (ffloor VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x9))>; + def : Pat<(v2f64 (fnearbyint VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0xC))>; + def : Pat<(v2f64 (fceil VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0xA))>; + def : Pat<(v2f64 (frint VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0x4))>; + def : Pat<(v2f64 (ftrunc VR128:$src)), + (VROUNDPDr VR128:$src, (i32 0xB))>; + + def : Pat<(v2f64 (ffloor (loadv2f64 addr:$src))), + (VROUNDPDm addr:$src, (i32 0x9))>; + def : Pat<(v2f64 (fnearbyint (loadv2f64 addr:$src))), + (VROUNDPDm addr:$src, (i32 0xC))>; + def : Pat<(v2f64 (fceil (loadv2f64 addr:$src))), + (VROUNDPDm addr:$src, (i32 0xA))>; + def : Pat<(v2f64 (frint (loadv2f64 addr:$src))), + (VROUNDPDm addr:$src, (i32 0x4))>; + def : Pat<(v2f64 (ftrunc (loadv2f64 addr:$src))), + (VROUNDPDm addr:$src, (i32 0xB))>; + + def : Pat<(v8f32 (ffloor VR256:$src)), + (VROUNDPSYr VR256:$src, (i32 0x9))>; + def : Pat<(v8f32 (fnearbyint VR256:$src)), + (VROUNDPSYr VR256:$src, (i32 0xC))>; + def : Pat<(v8f32 (fceil VR256:$src)), + (VROUNDPSYr VR256:$src, (i32 0xA))>; + def : Pat<(v8f32 (frint VR256:$src)), + (VROUNDPSYr VR256:$src, (i32 0x4))>; + def : Pat<(v8f32 (ftrunc VR256:$src)), + (VROUNDPSYr VR256:$src, (i32 0xB))>; + + def : Pat<(v8f32 (ffloor (loadv8f32 addr:$src))), + (VROUNDPSYm addr:$src, (i32 0x9))>; + def : Pat<(v8f32 (fnearbyint (loadv8f32 addr:$src))), + (VROUNDPSYm addr:$src, (i32 0xC))>; + def : Pat<(v8f32 (fceil (loadv8f32 addr:$src))), + (VROUNDPSYm addr:$src, (i32 0xA))>; + def : Pat<(v8f32 (frint (loadv8f32 
addr:$src))), + (VROUNDPSYm addr:$src, (i32 0x4))>; + def : Pat<(v8f32 (ftrunc (loadv8f32 addr:$src))), + (VROUNDPSYm addr:$src, (i32 0xB))>; + + def : Pat<(v4f64 (ffloor VR256:$src)), + (VROUNDPDYr VR256:$src, (i32 0x9))>; + def : Pat<(v4f64 (fnearbyint VR256:$src)), + (VROUNDPDYr VR256:$src, (i32 0xC))>; + def : Pat<(v4f64 (fceil VR256:$src)), + (VROUNDPDYr VR256:$src, (i32 0xA))>; + def : Pat<(v4f64 (frint VR256:$src)), + (VROUNDPDYr VR256:$src, (i32 0x4))>; + def : Pat<(v4f64 (ftrunc VR256:$src)), + (VROUNDPDYr VR256:$src, (i32 0xB))>; + + def : Pat<(v4f64 (ffloor (loadv4f64 addr:$src))), + (VROUNDPDYm addr:$src, (i32 0x9))>; + def : Pat<(v4f64 (fnearbyint (loadv4f64 addr:$src))), + (VROUNDPDYm addr:$src, (i32 0xC))>; + def : Pat<(v4f64 (fceil (loadv4f64 addr:$src))), + (VROUNDPDYm addr:$src, (i32 0xA))>; + def : Pat<(v4f64 (frint (loadv4f64 addr:$src))), + (VROUNDPDYm addr:$src, (i32 0x4))>; + def : Pat<(v4f64 (ftrunc (loadv4f64 addr:$src))), + (VROUNDPDYm addr:$src, (i32 0xB))>; +} + +let ExeDomain = SSEPackedSingle in +defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32, + memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>; +let ExeDomain = SSEPackedDouble in +defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64, + memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>; + +defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>; + +let Constraints = "$src1 = $dst" in +defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl, + v4f32, v2f64, X86RndScales>; + +let Predicates = [UseSSE41] in { + def : Pat<(ffloor FR32:$src), + (ROUNDSSr FR32:$src, (i32 0x9))>; + def : Pat<(f32 (fnearbyint FR32:$src)), + (ROUNDSSr FR32:$src, (i32 0xC))>; + def : Pat<(f32 (fceil FR32:$src)), + (ROUNDSSr FR32:$src, (i32 0xA))>; + def : Pat<(f32 (frint FR32:$src)), + (ROUNDSSr FR32:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc FR32:$src)), + (ROUNDSSr FR32:$src, (i32 0xB))>; + + def : Pat<(f64 (ffloor FR64:$src)), + (ROUNDSDr FR64:$src, (i32 0x9))>; + def : Pat<(f64 (fnearbyint FR64:$src)), + (ROUNDSDr FR64:$src, (i32 0xC))>; + def : Pat<(f64 (fceil FR64:$src)), + (ROUNDSDr FR64:$src, (i32 0xA))>; + def : Pat<(f64 (frint FR64:$src)), + (ROUNDSDr FR64:$src, (i32 0x4))>; + def : Pat<(f64 (ftrunc FR64:$src)), + (ROUNDSDr FR64:$src, (i32 0xB))>; +} + +let Predicates = [UseSSE41, OptForSize] in { + def : Pat<(ffloor (loadf32 addr:$src)), + (ROUNDSSm addr:$src, (i32 0x9))>; + def : Pat<(f32 (fnearbyint (loadf32 addr:$src))), + (ROUNDSSm addr:$src, (i32 0xC))>; + def : Pat<(f32 (fceil (loadf32 addr:$src))), + (ROUNDSSm addr:$src, (i32 0xA))>; + def : Pat<(f32 (frint (loadf32 addr:$src))), + (ROUNDSSm addr:$src, (i32 0x4))>; + def : Pat<(f32 (ftrunc (loadf32 addr:$src))), + (ROUNDSSm addr:$src, (i32 0xB))>; + + def : Pat<(f64 (ffloor (loadf64 addr:$src))), + (ROUNDSDm addr:$src, (i32 0x9))>; + def : Pat<(f64 (fnearbyint (loadf64 addr:$src))), + (ROUNDSDm addr:$src, (i32 0xC))>; + def : Pat<(f64 (fceil (loadf64 addr:$src))), + (ROUNDSDm addr:$src, (i32 0xA))>; + def : Pat<(f64 (frint (loadf64 addr:$src))), + (ROUNDSDm addr:$src, (i32 0x4))>; + def : Pat<(f64 (ftrunc (loadf64 addr:$src))), + (ROUNDSDm addr:$src, (i32 0xB))>; +} + +let Predicates = [UseSSE41] in { + def : Pat<(v4f32 (ffloor VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x9))>; + def : Pat<(v4f32 (fnearbyint VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0xC))>; + def : Pat<(v4f32 (fceil VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0xA))>; + def : Pat<(v4f32 (frint VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0x4))>; 
+ def : Pat<(v4f32 (ftrunc VR128:$src)), + (ROUNDPSr VR128:$src, (i32 0xB))>; + + def : Pat<(v4f32 (ffloor (memopv4f32 addr:$src))), + (ROUNDPSm addr:$src, (i32 0x9))>; + def : Pat<(v4f32 (fnearbyint (memopv4f32 addr:$src))), + (ROUNDPSm addr:$src, (i32 0xC))>; + def : Pat<(v4f32 (fceil (memopv4f32 addr:$src))), + (ROUNDPSm addr:$src, (i32 0xA))>; + def : Pat<(v4f32 (frint (memopv4f32 addr:$src))), + (ROUNDPSm addr:$src, (i32 0x4))>; + def : Pat<(v4f32 (ftrunc (memopv4f32 addr:$src))), + (ROUNDPSm addr:$src, (i32 0xB))>; + + def : Pat<(v2f64 (ffloor VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x9))>; + def : Pat<(v2f64 (fnearbyint VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0xC))>; + def : Pat<(v2f64 (fceil VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0xA))>; + def : Pat<(v2f64 (frint VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0x4))>; + def : Pat<(v2f64 (ftrunc VR128:$src)), + (ROUNDPDr VR128:$src, (i32 0xB))>; + + def : Pat<(v2f64 (ffloor (memopv2f64 addr:$src))), + (ROUNDPDm addr:$src, (i32 0x9))>; + def : Pat<(v2f64 (fnearbyint (memopv2f64 addr:$src))), + (ROUNDPDm addr:$src, (i32 0xC))>; + def : Pat<(v2f64 (fceil (memopv2f64 addr:$src))), + (ROUNDPDm addr:$src, (i32 0xA))>; + def : Pat<(v2f64 (frint (memopv2f64 addr:$src))), + (ROUNDPDm addr:$src, (i32 0x4))>; + def : Pat<(v2f64 (ftrunc (memopv2f64 addr:$src))), + (ROUNDPDm addr:$src, (i32 0xB))>; +} + +defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSS", X86Movss, + v4f32, 0x01, UseSSE41>; +defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSS", X86Movss, + v4f32, 0x02, UseSSE41>; +defm : scalar_unary_math_imm_patterns<ffloor, "ROUNDSD", X86Movsd, + v2f64, 0x01, UseSSE41>; +defm : scalar_unary_math_imm_patterns<fceil, "ROUNDSD", X86Movsd, + v2f64, 0x02, UseSSE41>; + +//===----------------------------------------------------------------------===// +// SSE4.1 - Packed Bit Test +//===----------------------------------------------------------------------===// + +// ptest instruction we'll lower to this in X86ISelLowering primarily from +// the intel intrinsic that corresponds to this. 
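// A minimal C sketch of that intrinsic-level entry point, assuming
// <immintrin.h> and -msse4.1 (helper name is hypothetical):
/*
  #include <immintrin.h>

  int is_all_zero(__m128i v) {
    // _mm_testz_si128 maps onto ptest: it returns ZF, i.e. 1 iff
    // (v & v) == 0.
    return _mm_testz_si128(v, v);
  }
*/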
+let Defs = [EFLAGS], Predicates = [HasAVX] in { +def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, + Sched<[SchedWriteVecTest.XMM]>, VEX, VEX_WIG; +def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>, + Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>, + VEX, VEX_WIG; + +def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>, + Sched<[SchedWriteVecTest.YMM]>, VEX, VEX_L, VEX_WIG; +def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), + "vptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>, + Sched<[SchedWriteVecTest.YMM.Folded, ReadAfterLd]>, + VEX, VEX_L, VEX_WIG; +} + +let Defs = [EFLAGS] in { +def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), + "ptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>, + Sched<[SchedWriteVecTest.XMM]>; +def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), + "ptest\t{$src2, $src1|$src1, $src2}", + [(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>, + Sched<[SchedWriteVecTest.XMM.Folded, ReadAfterLd]>; +} + +// The bit test instructions below are AVX only +multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, PatFrag mem_frag, ValueType vt, + X86FoldableSchedWrite sched> { + def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>, + Sched<[sched]>, VEX; + def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), + [(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>, + Sched<[sched.Folded, ReadAfterLd]>, VEX; +} + +let Defs = [EFLAGS], Predicates = [HasAVX] in { +let ExeDomain = SSEPackedSingle in { +defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32, + SchedWriteFTest.XMM>; +defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32, + SchedWriteFTest.YMM>, VEX_L; +} +let ExeDomain = SSEPackedDouble in { +defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64, + SchedWriteFTest.XMM>; +defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64, + SchedWriteFTest.YMM>, VEX_L; +} +} + +//===----------------------------------------------------------------------===// +// SSE4.1 - Misc Instructions +//===----------------------------------------------------------------------===// + +let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { + def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "popcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>, + Sched<[WritePOPCNT]>, OpSize16, XS; + def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "popcnt{w}\t{$src, $dst|$dst, $src}", + [(set GR16:$dst, (ctpop (loadi16 addr:$src))), + (implicit EFLAGS)]>, + Sched<[WritePOPCNT.Folded]>, OpSize16, XS; + + def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "popcnt{l}\t{$src, 
$dst|$dst, $src}", + [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>, + Sched<[WritePOPCNT]>, OpSize32, XS; + + def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "popcnt{l}\t{$src, $dst|$dst, $src}", + [(set GR32:$dst, (ctpop (loadi32 addr:$src))), + (implicit EFLAGS)]>, + Sched<[WritePOPCNT.Folded]>, OpSize32, XS; + + def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "popcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>, + Sched<[WritePOPCNT]>, XS; + def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "popcnt{q}\t{$src, $dst|$dst, $src}", + [(set GR64:$dst, (ctpop (loadi64 addr:$src))), + (implicit EFLAGS)]>, + Sched<[WritePOPCNT.Folded]>, XS; +} + +// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. +multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, + SDNode OpNode, PatFrag ld_frag, + X86FoldableSchedWrite Sched> { + def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>, + Sched<[Sched]>; + def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>, + Sched<[Sched.Folded]>; +} + +// PHMIN has the same profile as PSAD, thus we use the same scheduling +// model, although the naming is misleading. +let Predicates = [HasAVX] in +defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw", + X86phminpos, loadv2i64, + WritePHMINPOS>, VEX, VEX_WIG; +defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw", + X86phminpos, memopv2i64, + WritePHMINPOS>; + +/// SS48I_binop_rm - Simple SSE41 binary operator. 
+multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, X86FoldableSchedWrite sched, + bit Is2Addr = 1> { + let isCommutable = 1 in + def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, + Sched<[sched]>; + def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, + loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + VEX_4V, VEX_WIG; + defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, + loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + VEX_4V, VEX_WIG; + defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, + loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + VEX_4V, VEX_WIG; + defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, + loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + VEX_4V, VEX_WIG; + defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128, + loadv2i64, i128mem, SchedWriteVecIMul.XMM, 0>, + VEX_4V, VEX_WIG; +} +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { + defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, + loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + VEX_4V, VEX_WIG; + defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, + loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + VEX_4V, VEX_WIG; + defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, + loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + VEX_4V, VEX_WIG; + defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, + loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + VEX_4V, VEX_WIG; +} + +let Predicates = [HasAVX2, NoVLX] in { + defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, + loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, + loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, + loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, + loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256, + loadv4i64, i256mem, SchedWriteVecIMul.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; +} +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, + loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, + loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, + loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, + loadv4i64, i256mem, 
SchedWriteVecALU.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; +} + +let Constraints = "$src1 = $dst" in { + defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128, + memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128, + memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128, + memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128, + memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128, + memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128, + memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128, + memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128, + memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; + defm PMULDQ : SS48I_binop_rm<0x28, "pmuldq", X86pmuldq, v2i64, VR128, + memopv2i64, i128mem, SchedWriteVecIMul.XMM, 1>; +} + +let Predicates = [HasAVX, NoVLX] in + defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, + loadv2i64, i128mem, SchedWritePMULLD.XMM, 0>, + VEX_4V, VEX_WIG; +let Predicates = [HasAVX] in + defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, + loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + VEX_4V, VEX_WIG; + +let Predicates = [HasAVX2, NoVLX] in + defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, + loadv4i64, i256mem, SchedWritePMULLD.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; +let Predicates = [HasAVX2] in + defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, + loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; + +let Constraints = "$src1 = $dst" in { + defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, + memopv2i64, i128mem, SchedWritePMULLD.XMM, 1>; + defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128, + memopv2i64, i128mem, SchedWriteVecALU.XMM, 1>; +} + +/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate +multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr, + X86FoldableSchedWrite sched> { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, + Sched<[sched]>; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, + (IntId RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate +multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr, + X86FoldableSchedWrite sched> { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, 
RC:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, + Sched<[sched]>; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +def BlendCommuteImm2 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue() & 0x03; + return getI8Imm(Imm ^ 0x03, SDLoc(N)); +}]>; + +def BlendCommuteImm4 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue() & 0x0f; + return getI8Imm(Imm ^ 0x0f, SDLoc(N)); +}]>; + +def BlendCommuteImm8 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue() & 0xff; + return getI8Imm(Imm ^ 0xff, SDLoc(N)); +}]>; + +let Predicates = [HasAVX] in { + let isCommutable = 0 in { + defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, + VR128, loadv2i64, i128mem, 0, + SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG; + } + + let ExeDomain = SSEPackedSingle in + defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, + VR128, loadv4f32, f128mem, 0, + SchedWriteDPPS.XMM>, VEX_4V, VEX_WIG; + let ExeDomain = SSEPackedDouble in + defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, + VR128, loadv2f64, f128mem, 0, + SchedWriteDPPD.XMM>, VEX_4V, VEX_WIG; + let ExeDomain = SSEPackedSingle in + defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, + VR256, loadv8f32, i256mem, 0, + SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG; +} + +let Predicates = [HasAVX2] in { + let isCommutable = 0 in { + defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, + VR256, loadv4i64, i256mem, 0, + SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, VEX_WIG; + } +} + +let Constraints = "$src1 = $dst" in { + let isCommutable = 0 in { + defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, + VR128, memopv2i64, i128mem, 1, + SchedWriteMPSAD.XMM>; + } + + let ExeDomain = SSEPackedSingle in + defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, + VR128, memopv4f32, f128mem, 1, + SchedWriteDPPS.XMM>; + let ExeDomain = SSEPackedDouble in + defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, + VR128, memopv2f64, f128mem, 1, + SchedWriteDPPD.XMM>; +} + +/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate +multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr, Domain d, + X86FoldableSchedWrite sched, SDNodeXForm commuteXForm> { +let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, + Sched<[sched]>; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, 
$src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + + // Pattern to commute if load is in first source. + def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), + RC:$src1, imm:$src3)), + (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, + (commuteXForm imm:$src3))>; +} + +let Predicates = [HasAVX] in { + defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32, + VR128, loadv4f32, f128mem, 0, SSEPackedSingle, + SchedWriteFBlend.XMM, BlendCommuteImm4>, + VEX_4V, VEX_WIG; + defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32, + VR256, loadv8f32, f256mem, 0, SSEPackedSingle, + SchedWriteFBlend.YMM, BlendCommuteImm8>, + VEX_4V, VEX_L, VEX_WIG; + defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64, + VR128, loadv2f64, f128mem, 0, SSEPackedDouble, + SchedWriteFBlend.XMM, BlendCommuteImm2>, + VEX_4V, VEX_WIG; + defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64, + VR256, loadv4f64, f256mem, 0, SSEPackedDouble, + SchedWriteFBlend.YMM, BlendCommuteImm4>, + VEX_4V, VEX_L, VEX_WIG; + defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16, + VR128, loadv2i64, i128mem, 0, SSEPackedInt, + SchedWriteBlend.XMM, BlendCommuteImm8>, + VEX_4V, VEX_WIG; +} + +let Predicates = [HasAVX2] in { + defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16, + VR256, loadv4i64, i256mem, 0, SSEPackedInt, + SchedWriteBlend.YMM, BlendCommuteImm8>, + VEX_4V, VEX_L, VEX_WIG; +} + +defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, + VR128, memopv4f32, f128mem, 1, SSEPackedSingle, + SchedWriteFBlend.XMM, BlendCommuteImm4>; +defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64, + VR128, memopv2f64, f128mem, 1, SSEPackedDouble, + SchedWriteFBlend.XMM, BlendCommuteImm2>; +defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, + VR128, memopv2i64, i128mem, 1, SSEPackedInt, + SchedWriteBlend.XMM, BlendCommuteImm8>; + +// For insertion into the zero index (low half) of a 256-bit vector, it is +// more efficient to generate a blend with immediate instead of an insert*128. 
+let Predicates = [HasAVX] in { +def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)), + (VBLENDPDYrri VR256:$src1, + (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0x3)>; +def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)), + (VBLENDPSYrri VR256:$src1, + (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +} + +/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators +multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop, + PatFrag mem_frag, Intrinsic IntId, + X86FoldableSchedWrite sched> { + def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))], + SSEPackedInt>, TAPD, VEX_4V, + Sched<[sched]>; + + def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (IntId RC:$src1, (bitconvert (mem_frag addr:$src2)), + RC:$src3))], SSEPackedInt>, TAPD, VEX_4V, + Sched<[sched.Folded, ReadAfterLd, + // x86memop:$src2 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC::$src3 + ReadAfterLd]>; +} + +let Predicates = [HasAVX] in { +let ExeDomain = SSEPackedDouble in { +defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem, + loadv2f64, int_x86_sse41_blendvpd, + SchedWriteFVarBlend.XMM>; +defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem, + loadv4f64, int_x86_avx_blendv_pd_256, + SchedWriteFVarBlend.YMM>, VEX_L; +} // ExeDomain = SSEPackedDouble +let ExeDomain = SSEPackedSingle in { +defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem, + loadv4f32, int_x86_sse41_blendvps, + SchedWriteFVarBlend.XMM>; +defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem, + loadv8f32, int_x86_avx_blendv_ps_256, + SchedWriteFVarBlend.YMM>, VEX_L; +} // ExeDomain = SSEPackedSingle +defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, + loadv2i64, int_x86_sse41_pblendvb, + SchedWriteVarBlend.XMM>; +} + +let Predicates = [HasAVX2] in { +defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem, + loadv4i64, int_x86_avx2_pblendvb, + SchedWriteVarBlend.YMM>, VEX_L; +} + +let Predicates = [HasAVX] in { + def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), + (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1), + (v4f32 VR128:$src2))), + (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), + (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1), + (v2f64 VR128:$src2))), + (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>; + def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1), + (v8i32 VR256:$src2))), + (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1), + (v8f32 VR256:$src2))), + (VBLENDVPSYrr VR256:$src2, 
VR256:$src1, VR256:$mask)>; + def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1), + (v4i64 VR256:$src2))), + (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; + def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1), + (v4f64 VR256:$src2))), + (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>; +} + +let Predicates = [HasAVX2] in { + def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1), + (v32i8 VR256:$src2))), + (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>; +} + +// Prefer a movss or movsd over a blendps when optimizing for size. these were +// changed to use blends because blends have better throughput on sandybridge +// and haswell, but movs[s/d] are 1-2 byte shorter instructions. +let Predicates = [HasAVX, OptForSpeed] in { + def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; + + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (VBLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, (loadv4f32 addr:$src2))), + (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; + def : Pat<(v4f32 (X86Movss (loadv4f32 addr:$src2), VR128:$src1)), + (VBLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; + + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, (loadv2f64 addr:$src2))), + (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; + def : Pat<(v2f64 (X86Movsd (loadv2f64 addr:$src2), VR128:$src1)), + (VBLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>; + + // Move low f32 and clear high bits. + def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (v4f32 (VBLENDPSrri (v4f32 (V_SET0)), + (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), + (i8 1))), sub_xmm)>; + def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (v4i32 (VPBLENDWrri (v4i32 (V_SET0)), + (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), + (i8 3))), sub_xmm)>; + + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (v2f64 (VBLENDPDrri (v2f64 (V_SET0)), + (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), + (i8 1))), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))), + (SUBREG_TO_REG (i32 0), + (v2i64 (VPBLENDWrri (v2i64 (V_SET0)), + (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), + (i8 0xf))), sub_xmm)>; +} + +// Prefer a movss or movsd over a blendps when optimizing for size. these were +// changed to use blends because blends have better throughput on sandybridge +// and haswell, but movs[s/d] are 1-2 byte shorter instructions. +let Predicates = [UseSSE41, OptForSpeed] in { + // With SSE41 we can use blends for these patterns. 
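// A minimal C sketch of the operation being matched, assuming <immintrin.h>
// and -msse4.1 (helper name is hypothetical):
/*
  #include <immintrin.h>

  __m128 merge_low_lane(__m128 a, __m128 b) {
    // Same result as _mm_move_ss(a, b): lane 0 from b, lanes 1-3 from a.
    // The blend form has the better throughput, movss the shorter encoding,
    // hence the OptForSpeed guard on these patterns.
    return _mm_blend_ps(a, b, 0x1);
  }
*/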
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), + (BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; + def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), + (PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; + + def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), + (BLENDPSrri VR128:$src1, VR128:$src2, (i8 1))>; + def : Pat<(v4f32 (X86Movss VR128:$src1, (memopv4f32 addr:$src2))), + (BLENDPSrmi VR128:$src1, addr:$src2, (i8 1))>; + def : Pat<(v4f32 (X86Movss (memopv4f32 addr:$src2), VR128:$src1)), + (BLENDPSrmi VR128:$src1, addr:$src2, (i8 0xe))>; + + def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), + (BLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; + def : Pat<(v2f64 (X86Movsd VR128:$src1, (memopv2f64 addr:$src2))), + (BLENDPDrmi VR128:$src1, addr:$src2, (i8 1))>; + def : Pat<(v2f64 (X86Movsd (memopv2f64 addr:$src2), VR128:$src1)), + (BLENDPDrmi VR128:$src1, addr:$src2, (i8 2))>; +} + + +/// SS41I_ternary_int - SSE 4.1 ternary operator +let Uses = [XMM0], Constraints = "$src1 = $dst" in { + multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + X86MemOperand x86memop, Intrinsic IntId, + X86FoldableSchedWrite sched> { + def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, + "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), + [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>, + Sched<[sched]>; + + def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, x86memop:$src2), + !strconcat(OpcodeStr, + "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), + [(set VR128:$dst, + (IntId VR128:$src1, + (bitconvert (mem_frag addr:$src2)), XMM0))]>, + Sched<[sched.Folded, ReadAfterLd]>; + } +} + +let ExeDomain = SSEPackedDouble in +defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem, + int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>; +let ExeDomain = SSEPackedSingle in +defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem, + int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>; +defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem, + int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>; + +// Aliases with the implicit xmm0 argument +def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", + (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>; +def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}", + (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>; +def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", + (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>; +def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}", + (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>; +def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", + (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>; +def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}", + (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>; + +let Predicates = [UseSSE41] in { + def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1), + (v16i8 VR128:$src2))), + (PBLENDVBrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1), + (v4i32 VR128:$src2))), + (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1), + (v4f32 VR128:$src2))), + (BLENDVPSrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1), + (v2i64 VR128:$src2))), + (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; + def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1), + (v2f64 VR128:$src2))), + (BLENDVPDrr0 VR128:$src2, VR128:$src1)>; +} + +let AddedComplexity = 400 in { // Prefer 
non-temporal versions + +let Predicates = [HasAVX, NoVLX] in +def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLSNT.XMM.RM]>, VEX, VEX_WIG; +let Predicates = [HasAVX2, NoVLX] in +def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovntdqa\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLSNT.YMM.RM]>, VEX, VEX_L, VEX_WIG; +def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movntdqa\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteVecMoveLSNT.XMM.RM]>; + +let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v8f32 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; + def : Pat<(v4f64 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; + def : Pat<(v4i64 (alignednontemporalload addr:$src)), + (VMOVNTDQAYrm addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4f32 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; + def : Pat<(v2f64 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; + def : Pat<(v2i64 (alignednontemporalload addr:$src)), + (VMOVNTDQArm addr:$src)>; +} + +let Predicates = [UseSSE41] in { + def : Pat<(v4f32 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; + def : Pat<(v2f64 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; + def : Pat<(v2i64 (alignednontemporalload addr:$src)), + (MOVNTDQArm addr:$src)>; +} + +} // AddedComplexity + +//===----------------------------------------------------------------------===// +// SSE4.2 - Compare Instructions +//===----------------------------------------------------------------------===// + +/// SS42I_binop_rm - Simple SSE 4.2 binary operator +multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, X86FoldableSchedWrite sched, + bit Is2Addr = 1> { + def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, + Sched<[sched]>; + def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>, + Sched<[sched.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX] in + defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, + loadv2i64, i128mem, SchedWriteVecALU.XMM, 0>, + VEX_4V, VEX_WIG; + +let Predicates = [HasAVX2] in + defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, + loadv4i64, i256mem, SchedWriteVecALU.YMM, 0>, + VEX_4V, VEX_L, VEX_WIG; + +let Constraints = "$src1 = $dst" in + defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, + memopv2i64, i128mem, SchedWriteVecALU.XMM>; + +//===----------------------------------------------------------------------===// +// SSE4.2 - String/text Processing Instructions +//===----------------------------------------------------------------------===// + +multiclass pcmpistrm_SS42AI<string asm> { + def rr : SS42AI<0x62, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, 
$src2, $src3}"), + []>, Sched<[WritePCmpIStrM]>; + let mayLoad = 1 in + def rm :SS42AI<0x62, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, Sched<[WritePCmpIStrM.Folded, ReadAfterLd]>; +} + +let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in { + let Predicates = [HasAVX] in + defm VPCMPISTRM : pcmpistrm_SS42AI<"vpcmpistrm">, VEX; + defm PCMPISTRM : pcmpistrm_SS42AI<"pcmpistrm"> ; +} + +multiclass SS42AI_pcmpestrm<string asm> { + def rr : SS42AI<0x60, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, Sched<[WritePCmpEStrM]>; + let mayLoad = 1 in + def rm : SS42AI<0x60, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, Sched<[WritePCmpEStrM.Folded, ReadAfterLd]>; +} + +let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { + let Predicates = [HasAVX] in + defm VPCMPESTRM : SS42AI_pcmpestrm<"vpcmpestrm">, VEX; + defm PCMPESTRM : SS42AI_pcmpestrm<"pcmpestrm">; +} + +multiclass SS42AI_pcmpistri<string asm> { + def rr : SS42AI<0x63, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, Sched<[WritePCmpIStrI]>; + let mayLoad = 1 in + def rm : SS42AI<0x63, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"), + []>, Sched<[WritePCmpIStrI.Folded, ReadAfterLd]>; +} + +let Defs = [ECX, EFLAGS], hasSideEffects = 0 in { + let Predicates = [HasAVX] in + defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX; + defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">; +} + +multiclass SS42AI_pcmpestri<string asm> { + def rr : SS42AI<0x61, MRMSrcReg, (outs), + (ins VR128:$src1, VR128:$src3, u8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, Sched<[WritePCmpEStrI]>; + let mayLoad = 1 in + def rm : SS42AI<0x61, MRMSrcMem, (outs), + (ins VR128:$src1, i128mem:$src3, u8imm:$src5), + !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"), + []>, Sched<[WritePCmpEStrI.Folded, ReadAfterLd]>; +} + +let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in { + let Predicates = [HasAVX] in + defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX; + defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">; +} + +//===----------------------------------------------------------------------===// +// SSE4.2 - CRC Instructions +//===----------------------------------------------------------------------===// + +// No CRC instructions have AVX equivalents + +// crc intrinsic instruction +// This set of instructions are only rm, the only difference is the size +// of r and m. 
+class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut, + RegisterClass RCIn, SDPatternOperator Int> : + SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2), + !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), + [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))]>, + Sched<[WriteCRC32]>; + +class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut, + X86MemOperand x86memop, SDPatternOperator Int> : + SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2), + !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"), + [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))]>, + Sched<[WriteCRC32.Folded, ReadAfterLd]>; + +let Constraints = "$src1 = $dst" in { + def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem, + int_x86_sse42_crc32_32_8>; + def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8, + int_x86_sse42_crc32_32_8>; + def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem, + int_x86_sse42_crc32_32_16>, OpSize16; + def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16, + int_x86_sse42_crc32_32_16>, OpSize16; + def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem, + int_x86_sse42_crc32_32_32>, OpSize32; + def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32, + int_x86_sse42_crc32_32_32>, OpSize32; + def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem, + int_x86_sse42_crc32_64_64>, REX_W; + def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64, + int_x86_sse42_crc32_64_64>, REX_W; + let hasSideEffects = 0 in { + let mayLoad = 1 in + def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem, + null_frag>, REX_W; + def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8, + null_frag>, REX_W; + } +} + +//===----------------------------------------------------------------------===// +// SHA-NI Instructions +//===----------------------------------------------------------------------===// + +// FIXME: Is there a better scheduler class for SHA than WriteVecIMul? 
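// Note that SHA256RNDS2 below has an implicit third source fixed to XMM0
// (Uses = [XMM0] plus the explicit-%xmm0 aliases); the C intrinsic takes that
// operand as an ordinary argument and the compiler pins it to XMM0. A minimal
// sketch, assuming <immintrin.h> and -msha, with argument roles as documented
// in the Intel intrinsics guide:
//
//   #include <immintrin.h>
//
//   // Two SHA-256 rounds: state0 holds C,D,G,H; state1 holds A,B,E,F;
//   // wk holds the two pre-added message words + round constants.
//   static __m128i sha256_two_rounds(__m128i state0, __m128i state1, __m128i wk) {
//     return _mm_sha256rnds2_epu32(state0, state1, wk);   // wk must land in xmm0
//   }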
+multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, + X86FoldableSchedWrite sched, bit UsesXMM0 = 0> { + def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !if(UsesXMM0, + !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), + [!if(UsesXMM0, + (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), + (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, + T8, Sched<[sched]>; + + def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !if(UsesXMM0, + !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"), + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), + [!if(UsesXMM0, + (set VR128:$dst, (IntId VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)), + (set VR128:$dst, (IntId VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8, + Sched<[sched.Folded, ReadAfterLd]>; +} + +let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { + def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, + (i8 imm:$src3)))]>, TA, + Sched<[SchedWriteVecIMul.XMM]>; + def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_sha1rnds4 VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)), + (i8 imm:$src3)))]>, TA, + Sched<[SchedWriteVecIMul.XMM.Folded, ReadAfterLd]>; + + defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte, + SchedWriteVecIMul.XMM>; + defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1, + SchedWriteVecIMul.XMM>; + defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2, + SchedWriteVecIMul.XMM>; + + let Uses=[XMM0] in + defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, + SchedWriteVecIMul.XMM, 1>; + + defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1, + SchedWriteVecIMul.XMM>; + defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2, + SchedWriteVecIMul.XMM>; +} + +// Aliases with explicit %xmm0 +def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", + (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>; +def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", + (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>; + +//===----------------------------------------------------------------------===// +// AES-NI Instructions +//===----------------------------------------------------------------------===// + +multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId, PatFrag ld_frag, + bit Is2Addr = 0, RegisterClass RC = VR128, + X86MemOperand MemOp = i128mem> { + let AsmString = OpcodeStr## + !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}", + "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { + def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), "", + [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>, + Sched<[WriteAESDecEnc]>; + def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, MemOp:$src2), "", + [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>, + Sched<[WriteAESDecEnc.Folded, ReadAfterLd]>; + } +} + +// Perform One Round of an AES Encryption/Decryption Flow +let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in { + defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", + int_x86_aesni_aesenc, 
loadv2i64>, VEX_4V, VEX_WIG; + defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", + int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG; + defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", + int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG; + defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", + int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG; +} + +let Predicates = [NoVLX, HasVAES] in { + defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc", + int_x86_aesni_aesenc_256, loadv4i64, 0, VR256, + i256mem>, VEX_4V, VEX_L, VEX_WIG; + defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast", + int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256, + i256mem>, VEX_4V, VEX_L, VEX_WIG; + defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec", + int_x86_aesni_aesdec_256, loadv4i64, 0, VR256, + i256mem>, VEX_4V, VEX_L, VEX_WIG; + defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast", + int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256, + i256mem>, VEX_4V, VEX_L, VEX_WIG; +} + +let Constraints = "$src1 = $dst" in { + defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", + int_x86_aesni_aesenc, memopv2i64, 1>; + defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", + int_x86_aesni_aesenclast, memopv2i64, 1>; + defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", + int_x86_aesni_aesdec, memopv2i64, 1>; + defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", + int_x86_aesni_aesdeclast, memopv2i64, 1>; +} + +// Perform the AES InvMixColumn Transformation +let Predicates = [HasAVX, HasAES] in { + def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1), + "vaesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>, + VEX, VEX_WIG; + def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1), + "vaesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>, + Sched<[WriteAESIMC.Folded]>, VEX, VEX_WIG; +} +def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1), + "aesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, + (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>; +def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1), + "aesimc\t{$src1, $dst|$dst, $src1}", + [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>, + Sched<[WriteAESIMC.Folded]>; + +// AES Round Key Generation Assist +let Predicates = [HasAVX, HasAES] in { + def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, u8imm:$src2), + "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + Sched<[WriteAESKeyGen]>, VEX, VEX_WIG; + def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, u8imm:$src2), + "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>, + Sched<[WriteAESKeyGen.Folded]>, VEX, VEX_WIG; +} +def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, u8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>, + Sched<[WriteAESKeyGen]>; +def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, u8imm:$src2), + "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, + 
(int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>, + Sched<[WriteAESKeyGen.Folded]>; + +//===----------------------------------------------------------------------===// +// PCLMUL Instructions +//===----------------------------------------------------------------------===// + +// Immediate transform to help with commuting. +def PCLMULCommuteImm : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue(); + return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N)); +}]>; + +// SSE carry-less Multiplication instructions +let Predicates = [NoAVX, HasPCLMUL] in { + let Constraints = "$src1 = $dst" in { + let isCommutable = 1 in + def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, + Sched<[WriteCLMul]>; + + def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2), + imm:$src3))]>, + Sched<[WriteCLMul.Folded, ReadAfterLd]>; + } // Constraints = "$src1 = $dst" + + def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1, + (i8 imm:$src3)), + (PCLMULQDQrm VR128:$src1, addr:$src2, + (PCLMULCommuteImm imm:$src3))>; +} // Predicates = [NoAVX, HasPCLMUL] + +// SSE aliases +foreach HI = ["hq","lq"] in +foreach LO = ["hq","lq"] in { + def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", + (PCLMULQDQrr VR128:$dst, VR128:$src, + !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; + def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", + (PCLMULQDQrm VR128:$dst, i128mem:$src, + !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; +} + +// AVX carry-less Multiplication instructions +multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, + PatFrag LdFrag, Intrinsic IntId> { + let isCommutable = 1 in + def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set RC:$dst, + (IntId RC:$src1, RC:$src2, imm:$src3))]>, + Sched<[WriteCLMul]>; + + def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, MemOp:$src2, u8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set RC:$dst, + (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>, + Sched<[WriteCLMul.Folded, ReadAfterLd]>; + + // We can commute a load in the first operand by swapping the sources and + // rotating the immediate. 
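// PCLMULQDQ only looks at imm[0] (which quadword of the first source) and
// imm[4] (which quadword of the second source), and carry-less multiplication
// is commutative, so swapping the sources amounts to swapping the two nibbles
// of the immediate, which is what PCLMULCommuteImm computes. The same
// arithmetic in C (sketch):
//
//   #include <stdint.h>
//
//   static uint8_t pclmul_commute_imm(uint8_t imm) {
//     return (uint8_t)((imm >> 4) | (imm << 4));   // e.g. 0x01 <-> 0x10
//   }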
+ def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)), + (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2, + (PCLMULCommuteImm imm:$src3))>; +} + +let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in +defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64, + int_x86_pclmulqdq>, VEX_4V, VEX_WIG; + +let Predicates = [NoVLX, HasVPCLMULQDQ] in +defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64, + int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG; + +multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, + X86MemOperand MemOp, string Hi, string Lo> { + def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2, + !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; + def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2, + !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; +} + +multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC, + X86MemOperand MemOp> { + defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">; + defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">; + defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">; + defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">; +} + +// AVX aliases +defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>; +defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>; + +//===----------------------------------------------------------------------===// +// SSE4A Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [HasSSE4A] in { + +let ExeDomain = SSEPackedInt in { +let Constraints = "$src = $dst" in { +def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), + (ins VR128:$src, u8imm:$len, u8imm:$idx), + "extrq\t{$idx, $len, $src|$src, $len, $idx}", + [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len, + imm:$idx))]>, + PD, Sched<[SchedWriteVecALU.XMM]>; +def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$mask), + "extrq\t{$mask, $src|$src, $mask}", + [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, + VR128:$mask))]>, + PD, Sched<[SchedWriteVecALU.XMM]>; + +def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), + "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", + [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, + imm:$len, imm:$idx))]>, + XD, Sched<[SchedWriteVecALU.XMM]>; +def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src, VR128:$mask), + "insertq\t{$mask, $src|$src, $mask}", + [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, + VR128:$mask))]>, + XD, Sched<[SchedWriteVecALU.XMM]>; +} +} // ExeDomain = SSEPackedInt + +// Non-temporal (unaligned) scalar stores. 
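// GCC/Clang expose these as _mm_stream_ss/_mm_stream_sd in <ammintrin.h>
// (SSE4A, -msse4a); a minimal sketch, assuming that header and flag:
//
//   #include <ammintrin.h>
//
//   // Write one scalar to memory with a non-temporal hint (no cache fill).
//   static void stream_scalars(float *pf, double *pd, float f, double d) {
//     _mm_stream_ss(pf, _mm_set_ss(f));   // movntss
//     _mm_stream_sd(pd, _mm_set_sd(d));   // movntsd
//   }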
+let AddedComplexity = 400 in { // Prefer non-temporal versions +let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in { +def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), + "movntss\t{$src, $dst|$dst, $src}", []>, XS; + +def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), + "movntsd\t{$src, $dst|$dst, $src}", []>, XD; +} // SchedRW + +def : Pat<(nontemporalstore FR32:$src, addr:$dst), + (MOVNTSS addr:$dst, (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; + +def : Pat<(nontemporalstore FR64:$src, addr:$dst), + (MOVNTSD addr:$dst, (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; + +} // AddedComplexity +} // HasSSE4A + +//===----------------------------------------------------------------------===// +// AVX Instructions +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VBROADCAST - Load from memory and broadcast to all elements of the +// destination operand +// +class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType VT, + PatFrag ld_frag, SchedWrite Sched> : + AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>, + Sched<[Sched]>, VEX; + +// AVX2 adds register forms +class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType ResVT, ValueType OpVT, SchedWrite Sched> : + AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, + Sched<[Sched]>, VEX; + +let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in { + def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, + f32mem, v4f32, loadf32, + SchedWriteFShuffle.XMM.Folded>; + def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, + f32mem, v8f32, loadf32, + SchedWriteFShuffle.XMM.Folded>, VEX_L; +} +let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in +def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, + v4f64, loadf64, + SchedWriteFShuffle.XMM.Folded>, VEX_L; + +let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in { + def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, + v4f32, v4f32, SchedWriteFShuffle.XMM>; + def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, + v8f32, v4f32, WriteFShuffle256>, VEX_L; +} +let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in +def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, + v4f64, v2f64, WriteFShuffle256>, VEX_L; + +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (VBROADCASTSSrm addr:$src)>; + def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (VBROADCASTSSYrm addr:$src)>; + def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (VBROADCASTSDYrm addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both +// halves of a 256-bit vector. 
+// +let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in +def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), + (ins i128mem:$src), + "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, + Sched<[WriteShuffleLd]>, VEX, VEX_L; + +let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX], + ExeDomain = SSEPackedSingle in +def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), + (ins f128mem:$src), + "vbroadcastf128\t{$src, $dst|$dst, $src}", []>, + Sched<[SchedWriteFShuffle.XMM.Folded]>, VEX, VEX_L; + +let Predicates = [HasAVX2, NoVLX] in { +def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), + (VBROADCASTI128 addr:$src)>; +def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))), + (VBROADCASTI128 addr:$src)>; +def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), + (VBROADCASTI128 addr:$src)>; +def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), + (VBROADCASTI128 addr:$src)>; +} + +let Predicates = [HasAVX, NoVLX] in { +def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), + (VBROADCASTF128 addr:$src)>; +def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))), + (VBROADCASTF128 addr:$src)>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), + (VBROADCASTF128 addr:$src)>; +def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))), + (VBROADCASTF128 addr:$src)>; +def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), + (VBROADCASTF128 addr:$src)>; +def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), + (VBROADCASTF128 addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// VINSERTF128 - Insert packed floating-point values +// +let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { +def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR128:$src2, u8imm:$src3), + "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; +let mayLoad = 1 in +def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f128mem:$src2, u8imm:$src3), + "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[WriteFShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; +} + +// To create a 256-bit all ones value, we should produce VCMPTRUEPS +// with YMM register containing zero. +// FIXME: Avoid producing vxorps to clear the fake inputs. 
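// The trick: a packed-float compare with an always-true predicate (imm 0x0F,
// i.e. _CMP_TRUE_UQ) sets every destination bit, and unlike vpcmpeqd it has a
// 256-bit form on AVX1. Roughly what this produces, in intrinsics (sketch,
// assuming <immintrin.h> and -mavx):
//
//   #include <immintrin.h>
//
//   static __m256i all_ones_avx1(void) {
//     __m256 z = _mm256_setzero_ps();   // the "fake" zeroed inputs
//     return _mm256_castps_si256(_mm256_cmp_ps(z, z, _CMP_TRUE_UQ));  // vcmpps $0xf
//   }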
+let Predicates = [HasAVX1Only] in { +def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>; +} + +multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To, + PatFrag memop_frag> { + def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2), + (iPTR imm)), + (!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; + def : Pat<(vinsert128_insert:$ins (To VR256:$src1), + (From (bitconvert (memop_frag addr:$src2))), + (iPTR imm)), + (!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2, + (INSERT_get_vinsert128_imm VR256:$ins))>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>; + defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>; +} + +let Predicates = [HasAVX1Only] in { + defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>; + defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv2i64>; + defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>; + defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv2i64>; +} + +//===----------------------------------------------------------------------===// +// VEXTRACTF128 - Extract packed floating-point values +// +let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { +def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst), + (ins VR256:$src1, u8imm:$src2), + "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, Sched<[WriteFShuffle256]>, VEX, VEX_L; +let mayStore = 1 in +def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs), + (ins f128mem:$dst, VR256:$src1, u8imm:$src2), + "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, Sched<[WriteFStoreX]>, VEX, VEX_L; +} + +multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> { + def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)), + (To (!cast<Instruction>(InstrStr#rr) + (From VR256:$src1), + (EXTRACT_get_vextract128_imm VR128:$ext)))>; + def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1), + (iPTR imm))), addr:$dst), + (!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1, + (EXTRACT_get_vextract128_imm VR128:$ext))>; +} + +// AVX1 patterns +let Predicates = [HasAVX, NoVLX] in { + defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>; + defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>; +} + +let Predicates = [HasAVX1Only] in { + defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>; + defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>; + defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>; + defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>; +} + +//===----------------------------------------------------------------------===// +// VMASKMOV - Conditional SIMD Packed Loads and Stores +// +multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, + Intrinsic IntLd, Intrinsic IntLd256, + Intrinsic IntSt, Intrinsic IntSt256> { + def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, + VEX_4V, Sched<[WriteFMaskedLoad]>; + def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, + VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>; + def mr : AVX8I<opc_mr, MRMDestMem, (outs), + (ins 
f128mem:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, + VEX_4V, Sched<[WriteFMaskedStore]>; + def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), + (ins f256mem:$dst, VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, + VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>; +} + +let ExeDomain = SSEPackedSingle in +defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", + int_x86_avx_maskload_ps, + int_x86_avx_maskload_ps_256, + int_x86_avx_maskstore_ps, + int_x86_avx_maskstore_ps_256>; +let ExeDomain = SSEPackedDouble in +defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", + int_x86_avx_maskload_pd, + int_x86_avx_maskload_pd_256, + int_x86_avx_maskstore_pd, + int_x86_avx_maskstore_pd_256>; + +//===----------------------------------------------------------------------===// +// VPERMIL - Permute Single and Double Floating-Point Values +// + +multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, + RegisterClass RC, X86MemOperand x86memop_f, + X86MemOperand x86memop_i, PatFrag i_frag, + ValueType f_vt, ValueType i_vt, + X86FoldableSchedWrite sched, + X86FoldableSchedWrite varsched> { + let Predicates = [HasAVX, NoVLX] in { + def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V, + Sched<[varsched]>; + def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop_i:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, + (i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V, + Sched<[varsched.Folded, ReadAfterLd]>; + + def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, + Sched<[sched]>; + def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), + (ins x86memop_f:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set RC:$dst, + (f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, + Sched<[sched.Folded]>; + }// Predicates = [HasAVX, NoVLX] +} + +let ExeDomain = SSEPackedSingle in { + defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, + loadv2i64, v4f32, v4i32, SchedWriteFShuffle.XMM, + SchedWriteFVarShuffle.XMM>; + defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, + loadv4i64, v8f32, v8i32, SchedWriteFShuffle.YMM, + SchedWriteFVarShuffle.YMM>, VEX_L; +} +let ExeDomain = SSEPackedDouble in { + defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, + loadv2i64, v2f64, v2i64, SchedWriteFShuffle.XMM, + SchedWriteFVarShuffle.XMM>; + defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, + loadv4i64, v4f64, v4i64, SchedWriteFShuffle.YMM, + SchedWriteFVarShuffle.YMM>, VEX_L; +} + +//===----------------------------------------------------------------------===// +// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks +// + +let ExeDomain = SSEPackedSingle in { +let isCommutable = 1 in +def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), + 
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, + (i8 imm:$src3))))]>, VEX_4V, VEX_L, + Sched<[WriteFShuffle256]>; +def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, u8imm:$src3), + "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2), + (i8 imm:$src3)))]>, VEX_4V, VEX_L, + Sched<[WriteFShuffle256Ld, ReadAfterLd]>; +} + +// Immediate transform to help with commuting. +def Perm2XCommuteImm : SDNodeXForm<imm, [{ + return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N)); +}]>; + +let Predicates = [HasAVX] in { +// Pattern with load in other operand. +def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2), + VR256:$src1, (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), + (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; +def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, + (loadv4i64 addr:$src2), (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +// Pattern with load in other operand. +def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), + VR256:$src1, (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; +} + +//===----------------------------------------------------------------------===// +// VZERO - Zero YMM registers +// Note: These instruction do not affect the YMM16-YMM31. +// + +let SchedRW = [WriteSystem] in { +let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, + YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { + // Zero All YMM registers + def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", + [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, + Requires<[HasAVX]>, VEX_WIG; + + // Zero Upper bits of YMM registers + def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", + [(int_x86_avx_vzeroupper)]>, PS, VEX, + Requires<[HasAVX]>, VEX_WIG; +} // Defs +} // SchedRW + +//===----------------------------------------------------------------------===// +// Half precision conversion instructions +// + +multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, + X86FoldableSchedWrite sched> { + def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", + [(set RC:$dst, (X86cvtph2ps VR128:$src))]>, + T8PD, VEX, Sched<[sched]>; + let hasSideEffects = 0, mayLoad = 1 in + def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), + "vcvtph2ps\t{$src, $dst|$dst, $src}", + [(set RC:$dst, (X86cvtph2ps (bc_v8i16 + (loadv2i64 addr:$src))))]>, + T8PD, VEX, Sched<[sched.Folded]>; +} + +multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, + SchedWrite RR, SchedWrite MR> { + def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), + (ins RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>, + TAPD, VEX, Sched<[RR]>; + let hasSideEffects = 0, mayStore = 1 in + def mr : Ii8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + TAPD, VEX, Sched<[MR]>; +} + +let Predicates = [HasF16C, NoVLX] in { + defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>; + defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L; + defm 
VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH, + WriteCvtPS2PHSt>; + defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY, + WriteCvtPS2PHYSt>, VEX_L; + + // Pattern match vcvtph2ps of a scalar i64 load. + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))), + (VCVTPH2PSrm addr:$src)>; + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))), + (VCVTPH2PSrm addr:$src)>; + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert + (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (VCVTPH2PSrm addr:$src)>; + + def : Pat<(store (f64 (extractelt + (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), + (iPTR 0))), addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (i64 (extractelt + (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), + (iPTR 0))), addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst), + (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; +} + +// Patterns for matching conversions from float to half-float and vice versa. +let Predicates = [HasF16C, NoVLX] in { + // Use MXCSR.RC for rounding instead of explicitly specifying the default + // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the + // configurations we support (the default). However, falling back to MXCSR is + // more consistent with other instructions, which are always controlled by it. + // It's encoded as 0b100. + def : Pat<(fp_to_f16 FR32:$src), + (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr + (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>; + + def : Pat<(f16_to_fp GR16:$src), + (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr + (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >; + + def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))), + (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr + (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >; +} + +//===----------------------------------------------------------------------===// +// AVX2 Instructions +//===----------------------------------------------------------------------===// + +/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate +multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, X86FoldableSchedWrite sched, + RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, SDNodeXForm commuteXForm> { + let isCommutable = 1 in + def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>, + Sched<[sched]>, VEX_4V; + def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, + Sched<[sched.Folded, ReadAfterLd]>, VEX_4V; + + // Pattern to commute if load is in first source. 
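// For a blend, immediate bit i selects element i from the second source, so
// swapping the two sources is the same as complementing the low N immediate
// bits; that is what the BlendCommuteImm4/BlendCommuteImm8 transforms used
// below compute. The same arithmetic in C (sketch):
//
//   #include <stdint.h>
//
//   static uint8_t blend_commute_imm(uint8_t imm, unsigned nelts) {
//     return (uint8_t)(imm ^ ((1u << nelts) - 1u));   // e.g. 4 elts: 0x3 -> 0xC
//   }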
+ def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), + RC:$src1, imm:$src3)), + (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, + (commuteXForm imm:$src3))>; +} + +defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32, + SchedWriteBlend.XMM, VR128, loadv2i64, i128mem, + BlendCommuteImm4>; +defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, + SchedWriteBlend.YMM, VR256, loadv4i64, i256mem, + BlendCommuteImm8>, VEX_L; + +// For insertion into the zero index (low half) of a 256-bit vector, it is +// more efficient to generate a blend with immediate instead of an insert*128. +let Predicates = [HasAVX2] in { +def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), + (VPBLENDDYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), + (VPBLENDDYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), + (VPBLENDDYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), + (VPBLENDDYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)), + (VBLENDPSYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)), + (VBLENDPSYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)), + (VBLENDPSYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)), + (VBLENDPSYrri VR256:$src1, + (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + VR128:$src2, sub_xmm), 0xf)>; +} + +//===----------------------------------------------------------------------===// +// VPBROADCAST - Load from memory and broadcast to all elements of the +// destination operand +// +multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + ValueType OpVT128, ValueType OpVT256, Predicate prd> { + let Predicates = [HasAVX2, prd] in { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, + Sched<[SchedWriteShuffle.XMM]>, VEX; + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, + Sched<[SchedWriteShuffle.XMM.Folded]>, VEX; + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, + Sched<[WriteShuffle256]>, VEX, VEX_L; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, + Sched<[SchedWriteShuffle.XMM.Folded]>, VEX, VEX_L; + + // Provide 
aliases for broadcast from the same register class that + // automatically does the extract. + def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), + (!cast<Instruction>(NAME#"Yrr") + (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; + } +} + +defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, + v16i8, v32i8, NoVLX_Or_NoBWI>; +defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, + v8i16, v16i16, NoVLX_Or_NoBWI>; +defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, + v4i32, v8i32, NoVLX>; +defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, + v2i64, v4i64, NoVLX>; + +let Predicates = [HasAVX2, NoVLX] in { + // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. + def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))), + (VPBROADCASTQrm addr:$src)>; + def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))), + (VPBROADCASTQYrm addr:$src)>; + + def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (VPBROADCASTDrm addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (VPBROADCASTDYrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VPBROADCASTQrm addr:$src)>; + def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VPBROADCASTQYrm addr:$src)>; +} +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. + // This means we'll encounter truncated i32 loads; match that here. + def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; +} + +let Predicates = [HasAVX2, NoVLX] in { + // Provide aliases for broadcast from the same register class that + // automatically does the extract. + def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), + (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), + sub_xmm)))>; + def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))), + (VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), + sub_xmm)))>; +} + +let Predicates = [HasAVX2, NoVLX] in { + // Provide fallback in case the load node that is used in the patterns above + // is used by additional users, which prevents the pattern selection. 
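// With AVX2 the scalar can be broadcast straight from an xmm register, so no
// reload is needed when the load could not be folded. The register forms these
// patterns pick correspond to intrinsics like the following (sketch, assuming
// <immintrin.h> and -mavx2; helper names are illustrative only):
//
//   #include <immintrin.h>
//
//   static __m256 splat8f(float x) {
//     return _mm256_broadcastss_ps(_mm_set_ss(x));   // vbroadcastss ymm, xmm
//   }
//   static __m256d splat4d(double x) {
//     return _mm256_broadcastsd_pd(_mm_set_sd(x));   // vbroadcastsd ymm, xmm
//   }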
+ def : Pat<(v4f32 (X86VBroadcast FR32:$src)), + (VBROADCASTSSrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; + def : Pat<(v8f32 (X86VBroadcast FR32:$src)), + (VBROADCASTSSYrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)))>; + def : Pat<(v4f64 (X86VBroadcast FR64:$src)), + (VBROADCASTSDYrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; +} + +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { + def : Pat<(v16i8 (X86VBroadcast GR8:$src)), + (VPBROADCASTBrr (v16i8 (COPY_TO_REGCLASS + (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit)), + VR128)))>; + def : Pat<(v32i8 (X86VBroadcast GR8:$src)), + (VPBROADCASTBYrr (v16i8 (COPY_TO_REGCLASS + (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR8:$src, sub_8bit)), + VR128)))>; + + def : Pat<(v8i16 (X86VBroadcast GR16:$src)), + (VPBROADCASTWrr (v8i16 (COPY_TO_REGCLASS + (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR16:$src, sub_16bit)), + VR128)))>; + def : Pat<(v16i16 (X86VBroadcast GR16:$src)), + (VPBROADCASTWYrr (v8i16 (COPY_TO_REGCLASS + (i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), + GR16:$src, sub_16bit)), + VR128)))>; +} +let Predicates = [HasAVX2, NoVLX] in { + def : Pat<(v4i32 (X86VBroadcast GR32:$src)), + (VPBROADCASTDrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; + def : Pat<(v8i32 (X86VBroadcast GR32:$src)), + (VPBROADCASTDYrr (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)))>; + def : Pat<(v2i64 (X86VBroadcast GR64:$src)), + (VPBROADCASTQrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VPBROADCASTQYrr (v2i64 (COPY_TO_REGCLASS GR64:$src, VR128)))>; +} + +// AVX1 broadcast patterns +let Predicates = [HasAVX1Only] in { +def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSYrm addr:$src)>; +def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), + (VBROADCASTSDYrm addr:$src)>; +def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; +} + + // Provide fallback in case the load node that is used in the patterns above + // is used by additional users, which prevents the pattern selection. 
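// The 128-bit f64 broadcasts below are just (v)movddup, and the memory form
// folds the load directly. Equivalent intrinsics (sketch, SSE3/AVX, assuming
// <immintrin.h>):
//
//   #include <immintrin.h>
//
//   static __m128d splat2d_mem(const double *p) { return _mm_loaddup_pd(p); }  // movddup xmm, m64
//   static __m128d splat2d_reg(__m128d x)       { return _mm_movedup_pd(x); }  // movddup xmm, xmm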
+let Predicates = [HasAVX, NoVLX] in { + // 128bit broadcasts: + def : Pat<(v2f64 (X86VBroadcast f64:$src)), + (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))>; + def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPrm addr:$src)>; + + def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), + (VMOVDDUPrr VR128:$src)>; + def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), + (VMOVDDUPrm addr:$src)>; +} + +let Predicates = [HasAVX1Only] in { + def : Pat<(v4f32 (X86VBroadcast FR32:$src)), + (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>; + def : Pat<(v8f32 (X86VBroadcast FR32:$src)), + (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), + (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm), + (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>; + def : Pat<(v4f64 (X86VBroadcast FR64:$src)), + (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), + (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm), + (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), 1)>; + + def : Pat<(v4i32 (X86VBroadcast GR32:$src)), + (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)>; + def : Pat<(v8i32 (X86VBroadcast GR32:$src)), + (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), + (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), sub_xmm), + (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR32:$src, VR128)), 0)), 1)>; + def : Pat<(v4i64 (X86VBroadcast GR64:$src)), + (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), + (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), sub_xmm), + (v4i32 (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)), 1)>; + + def : Pat<(v2i64 (X86VBroadcast i64:$src)), + (VPSHUFDri (v4i32 (COPY_TO_REGCLASS GR64:$src, VR128)), 0x44)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VMOVDDUPrm addr:$src)>; +} + +//===----------------------------------------------------------------------===// +// VPERM - Permute instructions +// + +multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + ValueType OpVT, X86FoldableSchedWrite Sched, + X86MemOperand memOp> { + let Predicates = [HasAVX2, NoVLX] in { + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, + Sched<[Sched]>, VEX_4V, VEX_L; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, memOp:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermv VR256:$src1, + (bitconvert (mem_frag addr:$src2)))))]>, + Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L; + } +} + +defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteVarShuffle256, + i256mem>; +let ExeDomain = SSEPackedSingle in +defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFVarShuffle256, + f256mem>; + +multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, + ValueType OpVT, X86FoldableSchedWrite Sched, + X86MemOperand memOp> { + let Predicates = [HasAVX2, NoVLX] in { + def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, + Sched<[Sched]>, VEX, VEX_L; + def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst), + (ins 
memOp:$src1, u8imm:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (OpVT (X86VPermi (mem_frag addr:$src1), + (i8 imm:$src2))))]>, + Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L; + } +} + +defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64, + WriteShuffle256, i256mem>, VEX_W; +let ExeDomain = SSEPackedDouble in +defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64, + WriteFShuffle256, f256mem>, VEX_W; + +//===----------------------------------------------------------------------===// +// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks +// +let isCommutable = 1 in +def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2, u8imm:$src3), + "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, + (i8 imm:$src3))))]>, Sched<[WriteShuffle256]>, + VEX_4V, VEX_L; +def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, f256mem:$src2, u8imm:$src3), + "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), + (i8 imm:$src3)))]>, + Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; + +let Predicates = [HasAVX2] in +def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), + VR256:$src1, (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; + + +//===----------------------------------------------------------------------===// +// VINSERTI128 - Insert packed integer values +// +let hasSideEffects = 0 in { +def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR128:$src2, u8imm:$src3), + "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; +let mayLoad = 1 in +def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i128mem:$src2, u8imm:$src3), + "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; +} + +let Predicates = [HasAVX2, NoVLX] in { + defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>; + defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv2i64>; + defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>; + defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv2i64>; +} + +//===----------------------------------------------------------------------===// +// VEXTRACTI128 - Extract packed integer values +// +def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst), + (ins VR256:$src1, u8imm:$src2), + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + Sched<[WriteShuffle256]>, VEX, VEX_L; +let hasSideEffects = 0, mayStore = 1 in +def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), + (ins i128mem:$dst, VR256:$src1, u8imm:$src2), + "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + Sched<[SchedWriteVecMoveLS.XMM.MR]>, VEX, VEX_L; + +let Predicates = [HasAVX2, NoVLX] in { + defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>; + defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>; + defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>; + defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>; +} + +//===----------------------------------------------------------------------===// +// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores +// +multiclass 
avx2_pmovmask<string OpcodeStr, + Intrinsic IntLd128, Intrinsic IntLd256, + Intrinsic IntSt128, Intrinsic IntSt256> { + def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, + VEX_4V, Sched<[WriteVecMaskedLoad]>; + def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, + VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>; + def mr : AVX28I<0x8e, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, + VEX_4V, Sched<[WriteVecMaskedStore]>; + def Ymr : AVX28I<0x8e, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, + VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>; +} + +defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", + int_x86_avx2_maskload_d, + int_x86_avx2_maskload_d_256, + int_x86_avx2_maskstore_d, + int_x86_avx2_maskstore_d_256>; +defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", + int_x86_avx2_maskload_q, + int_x86_avx2_maskload_q_256, + int_x86_avx2_maskstore_q, + int_x86_avx2_maskstore_q_256>, VEX_W; + +multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT, + ValueType MaskVT, string BlendStr, ValueType ZeroVT> { + // masked store + def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)), + (!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>; + // masked load + def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)), + (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; + def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), + (VT (bitconvert (ZeroVT immAllZerosV))))), + (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>; + def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))), + (!cast<Instruction>(BlendStr#"rr") + RC:$src0, + (VT (!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)), + RC:$mask)>; +} +let Predicates = [HasAVX] in { + defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>; + defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>; + defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>; + defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>; +} +let Predicates = [HasAVX1Only] in { + // load/store i32/i64 not supported use ps/pd version + defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>; + defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>; + defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>; + defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; +} +let Predicates = [HasAVX2] in { + defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>; + defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>; + defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>; + defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>; +} + +//===----------------------------------------------------------------------===// +// 
SubVector Broadcasts +// Provide fallback in case the load node that is used in the patterns above +// is used by additional users, which prevents the pattern selection. + +let Predicates = [HasAVX2, NoVLX] in { +def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), + (VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v2i64 VR128:$src), 1)>; +def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))), + (VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v4i32 VR128:$src), 1)>; +def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))), + (VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v8i16 VR128:$src), 1)>; +def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))), + (VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v16i8 VR128:$src), 1)>; +} + +let Predicates = [HasAVX, NoVLX] in { +def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))), + (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v2f64 VR128:$src), 1)>; +def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))), + (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v4f32 VR128:$src), 1)>; +} + +let Predicates = [HasAVX1Only] in { +def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))), + (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v2i64 VR128:$src), 1)>; +def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))), + (VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v4i32 VR128:$src), 1)>; +def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))), + (VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v8i16 VR128:$src), 1)>; +def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))), + (VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm), + (v16i8 VR128:$src), 1)>; +} + +//===----------------------------------------------------------------------===// +// Variable Bit Shifts +// +multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128, ValueType vt256> { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>, + VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>; + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode VR128:$src1, + (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, + VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, ReadAfterLd]>; + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), + (ins VR256:$src1, VR256:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, + VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), + (ins VR256:$src1, i256mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, + (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>, + VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, ReadAfterLd]>; +} + +let Predicates = [HasAVX2, NoVLX] in { + defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; + defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, 
v2i64, v4i64>, VEX_W; + defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; + defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; + defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; + + def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)), + (VPSRAVDrr VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86vsrav VR128:$src1, + (bitconvert (loadv2i64 addr:$src2)))), + (VPSRAVDrm VR128:$src1, addr:$src2)>; + def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)), + (VPSRAVDYrr VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86vsrav VR256:$src1, + (bitconvert (loadv4i64 addr:$src2)))), + (VPSRAVDYrm VR256:$src1, addr:$src2)>; +} + +//===----------------------------------------------------------------------===// +// VGATHER - GATHER Operations + +// FIXME: Improve scheduling of gather instructions. +multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx, + ValueType VTy, PatFrag GatherNode128, + PatFrag GatherNode256, RegisterClass RC256, + X86MemOperand memop128, X86MemOperand memop256, + ValueType MTx = VTx, ValueType MTy = VTy> { + def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb), + (ins VR128:$src1, memop128:$src2, VR128:$mask), + !strconcat(OpcodeStr, + "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), + [(set (VTx VR128:$dst), (MTx VR128:$mask_wb), + (GatherNode128 VR128:$src1, VR128:$mask, + vectoraddr:$src2))]>, + VEX, Sched<[WriteLoad]>; + def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb), + (ins RC256:$src1, memop256:$src2, RC256:$mask), + !strconcat(OpcodeStr, + "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), + [(set (VTy RC256:$dst), (MTy RC256:$mask_wb), + (GatherNode256 RC256:$src1, RC256:$mask, + vectoraddr:$src2))]>, + VEX, VEX_L, Sched<[WriteLoad]>; +} + +let Predicates = [UseAVX2] in { + let mayLoad = 1, hasSideEffects = 0, Constraints + = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" + in { + defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32, + mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W; + defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64, + mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W; + defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32, + mgatherv8i32, VR256, vx128mem, vy256mem>; + defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64, + mgatherv4i64, VR128, vx64mem, vy128mem>; + + let ExeDomain = SSEPackedDouble in { + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32, + mgatherv4i32, VR256, vx128mem, vx256mem, + v2i64, v4i64>, VEX_W; + defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64, + mgatherv4i64, VR256, vx128mem, vy256mem, + v2i64, v4i64>, VEX_W; + } + + let ExeDomain = SSEPackedSingle in { + defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32, + mgatherv8i32, VR256, vx128mem, vy256mem, + v4i32, v8i32>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64, + mgatherv4i64, VR128, vx64mem, vy128mem, + v4i32, v4i32>; + } + } +} + +//===----------------------------------------------------------------------===// +// Extra selection patterns for f128, f128mem + +// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2. 
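// In intrinsics terms the same idea looks like this: any 16-byte value can be
// moved through the float domain via a bitcast, and movaps/movups need one
// mandatory prefix byte fewer than movdqa/movdqu, so the encoding is shorter
// (sketch, assuming <immintrin.h>):
//
//   #include <immintrin.h>
//
//   static void store_any128(void *dst, __m128i bits) {
//     _mm_storeu_ps((float *)dst, _mm_castsi128_ps(bits));   // movups, not movdqu
//   }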
+def : Pat<(alignedstore (f128 VR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>; +def : Pat<(store (f128 VR128:$src), addr:$dst), + (MOVUPSmr addr:$dst, (COPY_TO_REGCLASS (f128 VR128:$src), VR128))>; + +def : Pat<(alignedloadf128 addr:$src), + (COPY_TO_REGCLASS (MOVAPSrm addr:$src), VR128)>; +def : Pat<(loadf128 addr:$src), + (COPY_TO_REGCLASS (MOVUPSrm addr:$src), VR128)>; + +// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2 +def : Pat<(f128 (X86fand VR128:$src1, (memopf128 addr:$src2))), + (COPY_TO_REGCLASS + (ANDPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2), + VR128)>; + +def : Pat<(f128 (X86fand VR128:$src1, VR128:$src2)), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS VR128:$src1, VR128), + (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>; + +def : Pat<(f128 (X86for VR128:$src1, (memopf128 addr:$src2))), + (COPY_TO_REGCLASS + (ORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2), + VR128)>; + +def : Pat<(f128 (X86for VR128:$src1, VR128:$src2)), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128), + (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>; + +def : Pat<(f128 (X86fxor VR128:$src1, (memopf128 addr:$src2))), + (COPY_TO_REGCLASS + (XORPSrm (COPY_TO_REGCLASS VR128:$src1, VR128), f128mem:$src2), + VR128)>; + +def : Pat<(f128 (X86fxor VR128:$src1, VR128:$src2)), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS VR128:$src1, VR128), + (COPY_TO_REGCLASS VR128:$src2, VR128)), VR128)>; + +//===----------------------------------------------------------------------===// +// GFNI instructions +//===----------------------------------------------------------------------===// + +multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT, + RegisterClass RC, PatFrag MemOpFrag, + X86MemOperand X86MemOp, bit Is2Addr = 0> { + let ExeDomain = SSEPackedInt, + AsmString = !if(Is2Addr, + OpcodeStr##"\t{$src2, $dst|$dst, $src2}", + OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { + let isCommutable = 1 in + def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "", + [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>, + Sched<[SchedWriteVecALU.XMM]>, T8PD; + + def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "", + [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, + (bitconvert (MemOpFrag addr:$src2)))))]>, + Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>, T8PD; + } +} + +multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT, + SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag, + X86MemOperand X86MemOp, bit Is2Addr = 0> { + let AsmString = !if(Is2Addr, + OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}", + OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in { + def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), "", + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))], + SSEPackedInt>, Sched<[SchedWriteVecALU.XMM]>; + def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "", + [(set RC:$dst, (OpVT (OpNode RC:$src1, + (bitconvert (MemOpFrag addr:$src2)), + imm:$src3)))], SSEPackedInt>, + Sched<[SchedWriteVecALU.XMM.Folded, ReadAfterLd]>; + } +} + +multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> { + let Constraints = "$src1 = $dst", + Predicates = [HasGFNI, UseSSE2] in + defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode, + VR128, loadv2i64, i128mem, 1>; + let Predicates = [HasGFNI, HasAVX, 
NoVLX_Or_NoBWI] in { + defm V##NAME : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128, + loadv2i64, i128mem>, VEX_4V, VEX_W; + defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256, + loadv4i64, i256mem>, VEX_4V, VEX_L, VEX_W; + } +} + +// GF2P8MULB +let Constraints = "$src1 = $dst", + Predicates = [HasGFNI, UseSSE2] in +defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64, + i128mem, 1>; +let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in { + defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64, + i128mem>, VEX_4V; + defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64, + i256mem>, VEX_4V, VEX_L; +} +// GF2P8AFFINEINVQB, GF2P8AFFINEQB +let isCommutable = 0 in { + defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb", + X86GF2P8affineinvqb>, TAPD; + defm GF2P8AFFINEQB : GF2P8AFFINE_common<0xCE, "gf2p8affineqb", + X86GF2P8affineqb>, TAPD; +} + diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrSVM.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrSVM.td new file mode 100644 index 000000000..2dc6e8b43 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrSVM.td @@ -0,0 +1,63 @@ +//===-- X86InstrSVM.td - SVM Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the AMD SVM instruction +// set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SVM instructions + +let SchedRW = [WriteSystem] in { +// 0F 01 D9 +def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB; + +// 0F 01 DC +def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB; + +// 0F 01 DD +def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB; + +// 0F 01 DE +let Uses = [EAX] in +def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB; + +// 0F 01 D8 +let Uses = [EAX] in +def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), "vmrun\t{%eax|eax}", []>, TB, + Requires<[Not64BitMode]>; +let Uses = [RAX] in +def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), "vmrun\t{%rax|rax}", []>, TB, + Requires<[In64BitMode]>; + +// 0F 01 DA +let Uses = [EAX] in +def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), "vmload\t{%eax|eax}", []>, TB, + Requires<[Not64BitMode]>; +let Uses = [RAX] in +def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), "vmload\t{%rax|rax}", []>, TB, + Requires<[In64BitMode]>; + +// 0F 01 DB +let Uses = [EAX] in +def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), "vmsave\t{%eax|eax}", []>, TB, + Requires<[Not64BitMode]>; +let Uses = [RAX] in +def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), "vmsave\t{%rax|rax}", []>, TB, + Requires<[In64BitMode]>; + +// 0F 01 DF +let Uses = [EAX, ECX] in +def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins), + "invlpga\t{%eax, %ecx|eax, ecx}", []>, TB, Requires<[Not64BitMode]>; +let Uses = [RAX, ECX] in +def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins), + "invlpga\t{%rax, %ecx|rax, ecx}", []>, TB, Requires<[In64BitMode]>; +} // SchedRW diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrShiftRotate.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrShiftRotate.td new file mode 100644 index 000000000..cbcb1f7f7 --- /dev/null +++ 
b/capstone/suite/synctools/tablegen/X86/back/X86InstrShiftRotate.td @@ -0,0 +1,1031 @@ +//===-- X86InstrShiftRotate.td - Shift and Rotate Instrs ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the shift and rotate instructions. +// +//===----------------------------------------------------------------------===// + +// FIXME: Someone needs to smear multipattern goodness all over this file. + +let Defs = [EFLAGS] in { + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def SHL8rCL : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1), + "shl{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (shl GR8:$src1, CL))]>; +def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1), + "shl{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (shl GR16:$src1, CL))]>, OpSize16; +def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1), + "shl{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (shl GR32:$src1, CL))]>, OpSize32; +def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), + "shl{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (shl GR64:$src1, CL))]>; +} // Uses = [CL] + +def SAL8rCL : I<0xD2, MRM6r, (outs GR8 :$dst), (ins GR8 :$src1), "sal{b}\t{%cl, $dst|$dst, cl}", []>; +def SAL16rCL : I<0xD3, MRM6r, (outs GR16:$dst), (ins GR16:$src1), + "sal{w}\t{%cl, $dst|$dst, cl}", + []>, OpSize16; +def SAL32rCL : I<0xD3, MRM6r, (outs GR32:$dst), (ins GR32:$src1), + "sal{l}\t{%cl, $dst|$dst, cl}", + []>, OpSize32; +def SAL64rCL : RI<0xD3, MRM6r, (outs GR64:$dst), (ins GR64:$src1), + "sal{q}\t{%cl, $dst|$dst, cl}", + []>; + +def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), + "shl{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>; + +def SAL8ri : Ii8<0xC0, MRM6r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), + "sal{b}\t{$src2, $dst|$dst, $src2}", + []>; + +let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. +def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "shl{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, + OpSize16; +def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "shl{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>, + OpSize32; +def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$src2), + "shl{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>; + +def SAL16ri : Ii8<0xC1, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), + "sal{w}\t{$src2, $dst|$dst, $src2}", + []>, + OpSize16; +def SAL32ri : Ii8<0xC1, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), + "sal{l}\t{$src2, $dst|$dst, $src2}", + []>, + OpSize32; +def SAL64ri : RIi8<0xC1, MRM6r, (outs GR64:$dst), + (ins GR64:$src1, i8imm:$src2), + "sal{q}\t{$src2, $dst|$dst, $src2}", + []>; +} // isConvertibleToThreeAddress = 1 + +// NOTE: We don't include patterns for shifts of a register by one, because +// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one). 
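+// For illustration, both forms compute the same value for any register width:
+//   shl $1, %eax      ; eax = eax << 1
+//   add %eax, %eax    ; eax = eax + eax == eax << 1
+// so ISel prefers the ADD form, and the shift-by-one defs below therefore
+// carry empty patterns.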
+let hasSideEffects = 0 in { +def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1), + "shl{b}\t{$$1, $dst|$dst, 1}", []>; +def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1), + "shl{w}\t{$$1, $dst|$dst, 1}", []>, OpSize16; +def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1), + "shl{l}\t{$$1, $dst|$dst, 1}", []>, OpSize32; +def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1), + "shl{q}\t{$$1, $dst|$dst, 1}", []>; +def SAL8r1 : I<0xD0, MRM6r, (outs GR8:$dst), (ins GR8:$src1), + "sal{b}\t{$$1, $dst|$dst, 1}", []>; +def SAL16r1 : I<0xD1, MRM6r, (outs GR16:$dst), (ins GR16:$src1), + "sal{w}\t{$$1, $dst|$dst, 1}", []>, OpSize16; +def SAL32r1 : I<0xD1, MRM6r, (outs GR32:$dst), (ins GR32:$src1), + "sal{l}\t{$$1, $dst|$dst, 1}", []>, OpSize32; +def SAL64r1 : RI<0xD1, MRM6r, (outs GR64:$dst), (ins GR64:$src1), + "sal{q}\t{$$1, $dst|$dst, 1}", []>; +} // hasSideEffects = 0 +} // Constraints = "$src = $dst", SchedRW + + +let SchedRW = [WriteShiftLd, WriteRMW] in { +// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern +// using CL? +let Uses = [CL] in { +def SHL8mCL : I<0xD2, MRM4m, (outs), (ins i8mem :$dst), + "shl{b}\t{%cl, $dst|$dst, cl}", + [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>; +def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst), + "shl{w}\t{%cl, $dst|$dst, cl}", + [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>, + OpSize16; +def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst), + "shl{l}\t{%cl, $dst|$dst, cl}", + [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>, + OpSize32; +def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), + "shl{q}\t{%cl, $dst|$dst, cl}", + [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>, + Requires<[In64BitMode]>; +def SAL8mCL : I<0xD2, MRM6m, (outs), (ins i8mem :$dst), + "sal{b}\t{%cl, $dst|$dst, cl}", + []>; +def SAL16mCL : I<0xD3, MRM6m, (outs), (ins i16mem:$dst), + "sal{w}\t{%cl, $dst|$dst, cl}", + []>, + OpSize16; +def SAL32mCL : I<0xD3, MRM6m, (outs), (ins i32mem:$dst), + "sal{l}\t{%cl, $dst|$dst, cl}", + []>, + OpSize32; +def SAL64mCL : RI<0xD3, MRM6m, (outs), (ins i64mem:$dst), + "sal{q}\t{%cl, $dst|$dst, cl}", + []>; +} +def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src), + "shl{b}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src), + "shl{w}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize16; +def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src), + "shl{l}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize32; +def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src), + "shl{q}\t{$src, $dst|$dst, $src}", + [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + Requires<[In64BitMode]>; +def SAL8mi : Ii8<0xC0, MRM6m, (outs), (ins i8mem :$dst, i8imm:$src), + "sal{b}\t{$src, $dst|$dst, $src}", + []>; +def SAL16mi : Ii8<0xC1, MRM6m, (outs), (ins i16mem:$dst, i8imm:$src), + "sal{w}\t{$src, $dst|$dst, $src}", + []>, OpSize16; +def SAL32mi : Ii8<0xC1, MRM6m, (outs), (ins i32mem:$dst, i8imm:$src), + "sal{l}\t{$src, $dst|$dst, $src}", + []>, OpSize32; +def SAL64mi : RIi8<0xC1, MRM6m, (outs), (ins i64mem:$dst, i8imm:$src), + "sal{q}\t{$src, $dst|$dst, $src}", + []>; + +// Shift by 1 +def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst), + "shl{b}\t{$dst|$dst, 1}", + [(store (shl (loadi8 addr:$dst), 
(i8 1)), addr:$dst)]>; +def SHL16m1 : I<0xD1, MRM4m, (outs), (ins i16mem:$dst), + "shl{w}\t{$dst|$dst, 1}", + [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize16; +def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst), + "shl{l}\t{$dst|$dst, 1}", + [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize32; +def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst), + "shl{q}\t{$dst|$dst, 1}", + [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>, + Requires<[In64BitMode]>; +def SAL8m1 : I<0xD0, MRM6m, (outs), (ins i8mem :$dst), + "sal{b}\t{$dst|$dst, 1}", + []>; +def SAL16m1 : I<0xD1, MRM6m, (outs), (ins i16mem:$dst), + "sal{w}\t{$dst|$dst, 1}", + []>, OpSize16; +def SAL32m1 : I<0xD1, MRM6m, (outs), (ins i32mem:$dst), + "sal{l}\t{$dst|$dst, 1}", + []>, OpSize32; +def SAL64m1 : RI<0xD1, MRM6m, (outs), (ins i64mem:$dst), + "sal{q}\t{$dst|$dst, 1}", + []>; +} // SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def SHR8rCL : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1), + "shr{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (srl GR8:$src1, CL))]>; +def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1), + "shr{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (srl GR16:$src1, CL))]>, OpSize16; +def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1), + "shr{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (srl GR32:$src1, CL))]>, OpSize32; +def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), + "shr{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (srl GR64:$src1, CL))]>; +} + +def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2), + "shr{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>; +def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "shr{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>, + OpSize16; +def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "shr{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>, + OpSize32; +def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2), + "shr{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>; + +// Shift right by 1 +def SHR8r1 : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1), + "shr{b}\t{$$1, $dst|$dst, 1}", + [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>; +def SHR16r1 : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1), + "shr{w}\t{$$1, $dst|$dst, 1}", + [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize16; +def SHR32r1 : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1), + "shr{l}\t{$$1, $dst|$dst, 1}", + [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>, OpSize32; +def SHR64r1 : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1), + "shr{q}\t{$$1, $dst|$dst, 1}", + [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>; +} // Constraints = "$src = $dst", SchedRW + + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def SHR8mCL : I<0xD2, MRM5m, (outs), (ins i8mem :$dst), + "shr{b}\t{%cl, $dst|$dst, cl}", + [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>; +def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst), + "shr{w}\t{%cl, $dst|$dst, cl}", + [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>, + OpSize16; +def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst), + "shr{l}\t{%cl, $dst|$dst, cl}", + [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>, + OpSize32; +def SHR64mCL : RI<0xD3, 
MRM5m, (outs), (ins i64mem:$dst), + "shr{q}\t{%cl, $dst|$dst, cl}", + [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>, + Requires<[In64BitMode]>; +} +def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src), + "shr{b}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src), + "shr{w}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize16; +def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src), + "shr{l}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize32; +def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src), + "shr{q}\t{$src, $dst|$dst, $src}", + [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + Requires<[In64BitMode]>; + +// Shift by 1 +def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst), + "shr{b}\t{$dst|$dst, 1}", + [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; +def SHR16m1 : I<0xD1, MRM5m, (outs), (ins i16mem:$dst), + "shr{w}\t{$dst|$dst, 1}", + [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize16; +def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst), + "shr{l}\t{$dst|$dst, 1}", + [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize32; +def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst), + "shr{q}\t{$dst|$dst, 1}", + [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>, + Requires<[In64BitMode]>; +} // SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def SAR8rCL : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), + "sar{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (sra GR8:$src1, CL))]>; +def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1), + "sar{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (sra GR16:$src1, CL))]>, + OpSize16; +def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1), + "sar{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (sra GR32:$src1, CL))]>, + OpSize32; +def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), + "sar{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (sra GR64:$src1, CL))]>; +} + +def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), + "sar{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>; +def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "sar{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>, + OpSize16; +def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "sar{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>, + OpSize32; +def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$src2), + "sar{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>; + +// Shift by 1 +def SAR8r1 : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1), + "sar{b}\t{$$1, $dst|$dst, 1}", + [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>; +def SAR16r1 : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1), + "sar{w}\t{$$1, $dst|$dst, 1}", + [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize16; +def SAR32r1 : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1), + "sar{l}\t{$$1, $dst|$dst, 1}", + [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>, OpSize32; +def SAR64r1 : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1), + "sar{q}\t{$$1, $dst|$dst, 1}", + 
[(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>; +} // Constraints = "$src = $dst", SchedRW + + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def SAR8mCL : I<0xD2, MRM7m, (outs), (ins i8mem :$dst), + "sar{b}\t{%cl, $dst|$dst, cl}", + [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>; +def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst), + "sar{w}\t{%cl, $dst|$dst, cl}", + [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>, + OpSize16; +def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), + "sar{l}\t{%cl, $dst|$dst, cl}", + [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>, + OpSize32; +def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), + "sar{q}\t{%cl, $dst|$dst, cl}", + [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>, + Requires<[In64BitMode]>; +} +def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src), + "sar{b}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src), + "sar{w}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize16; +def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src), + "sar{l}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize32; +def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src), + "sar{q}\t{$src, $dst|$dst, $src}", + [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + Requires<[In64BitMode]>; + +// Shift by 1 +def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst), + "sar{b}\t{$dst|$dst, 1}", + [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; +def SAR16m1 : I<0xD1, MRM7m, (outs), (ins i16mem:$dst), + "sar{w}\t{$dst|$dst, 1}", + [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize16; +def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst), + "sar{l}\t{$dst|$dst, 1}", + [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize32; +def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst), + "sar{q}\t{$dst|$dst, 1}", + [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Rotate instructions +//===----------------------------------------------------------------------===// + +let hasSideEffects = 0 in { +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { + +let Uses = [CL, EFLAGS] in { +def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), + "rcl{b}\t{%cl, $dst|$dst, cl}", []>; +def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "rcl{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16; +def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + "rcl{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32; +def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), + "rcl{q}\t{%cl, $dst|$dst, cl}", []>; +} // Uses = [CL, EFLAGS] + +let Uses = [EFLAGS] in { +def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1), + "rcl{b}\t{$$1, $dst|$dst, 1}", []>; +def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), + "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>; +def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), + "rcl{w}\t{$$1, $dst|$dst, 1}", []>, OpSize16; +def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), + "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16; +def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), + 
"rcl{l}\t{$$1, $dst|$dst, 1}", []>, OpSize32; +def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), + "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32; +def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), + "rcl{q}\t{$$1, $dst|$dst, 1}", []>; +def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>; +} // Uses = [EFLAGS] + +let Uses = [CL, EFLAGS] in { +def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), + "rcr{b}\t{%cl, $dst|$dst, cl}", []>; +def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "rcr{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16; +def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "rcr{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32; +def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), + "rcr{q}\t{%cl, $dst|$dst, cl}", []>; +} // Uses = [CL, EFLAGS] + +let Uses = [EFLAGS] in { +def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), + "rcr{b}\t{$$1, $dst|$dst, 1}", []>; +def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), + "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>; +def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), + "rcr{w}\t{$$1, $dst|$dst, 1}", []>, OpSize16; +def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), + "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16; +def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), + "rcr{l}\t{$$1, $dst|$dst, 1}", []>, OpSize32; +def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), + "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32; +def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), + "rcr{q}\t{$$1, $dst|$dst, 1}", []>; +def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>; +} // Uses = [EFLAGS] + +} // Constraints = "$src = $dst" + +let SchedRW = [WriteShiftLd, WriteRMW], mayStore = 1 in { +let Uses = [EFLAGS] in { +def RCL8m1 : I<0xD0, MRM2m, (outs), (ins i8mem:$dst), + "rcl{b}\t$dst", []>; +def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt), + "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>; +def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst), + "rcl{w}\t$dst", []>, OpSize16; +def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt), + "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16; +def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst), + "rcl{l}\t$dst", []>, OpSize32; +def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt), + "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32; +def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst), + "rcl{q}\t$dst", []>, Requires<[In64BitMode]>; +def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt), + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>, + Requires<[In64BitMode]>; + +def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst), + "rcr{b}\t$dst", []>; +def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt), + "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>; +def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst), + "rcr{w}\t$dst", []>, OpSize16; +def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt), + "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize16; +def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst), + "rcr{l}\t$dst", []>, OpSize32; +def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt), + "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize32; +def RCR64m1 : RI<0xD1, MRM3m, 
(outs), (ins i64mem:$dst), + "rcr{q}\t$dst", []>, Requires<[In64BitMode]>; +def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt), + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>, + Requires<[In64BitMode]>; +} // Uses = [EFLAGS] + +let Uses = [CL, EFLAGS] in { +def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst), + "rcl{b}\t{%cl, $dst|$dst, cl}", []>; +def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst), + "rcl{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16; +def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst), + "rcl{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32; +def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst), + "rcl{q}\t{%cl, $dst|$dst, cl}", []>, + Requires<[In64BitMode]>; + +def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst), + "rcr{b}\t{%cl, $dst|$dst, cl}", []>; +def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst), + "rcr{w}\t{%cl, $dst|$dst, cl}", []>, OpSize16; +def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst), + "rcr{l}\t{%cl, $dst|$dst, cl}", []>, OpSize32; +def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), + "rcr{q}\t{%cl, $dst|$dst, cl}", []>, + Requires<[In64BitMode]>; +} // Uses = [CL, EFLAGS] +} // SchedRW +} // hasSideEffects = 0 + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +// FIXME: provide shorter instructions when imm8 == 1 +let Uses = [CL] in { +def ROL8rCL : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "rol{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (rotl GR8:$src1, CL))]>; +def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "rol{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (rotl GR16:$src1, CL))]>, OpSize16; +def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "rol{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (rotl GR32:$src1, CL))]>, OpSize32; +def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), + "rol{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (rotl GR64:$src1, CL))]>; +} + +def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), + "rol{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>; +def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "rol{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize16; +def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "rol{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>, OpSize32; +def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$src2), + "rol{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>; + +// Rotate by 1 +def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1), + "rol{b}\t{$$1, $dst|$dst, 1}", + [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>; +def ROL16r1 : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1), + "rol{w}\t{$$1, $dst|$dst, 1}", + [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize16; +def ROL32r1 : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1), + "rol{l}\t{$$1, $dst|$dst, 1}", + [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>, OpSize32; +def ROL64r1 : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1), + "rol{q}\t{$$1, $dst|$dst, 1}", + [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>; +} // Constraints = "$src = $dst", SchedRW + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def ROL8mCL : I<0xD2, MRM0m, (outs), (ins i8mem :$dst), + "rol{b}\t{%cl, $dst|$dst, cl}", + [(store (rotl (loadi8 addr:$dst), CL), 
addr:$dst)]>; +def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst), + "rol{w}\t{%cl, $dst|$dst, cl}", + [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize16; +def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst), + "rol{l}\t{%cl, $dst|$dst, cl}", + [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>, OpSize32; +def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), + "rol{q}\t{%cl, $dst|$dst, cl}", + [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>, + Requires<[In64BitMode]>; +} +def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1), + "rol{b}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)]>; +def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1), + "rol{w}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)]>, + OpSize16; +def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1), + "rol{l}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)]>, + OpSize32; +def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1), + "rol{q}\t{$src1, $dst|$dst, $src1}", + [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)]>, + Requires<[In64BitMode]>; + +// Rotate by 1 +def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst), + "rol{b}\t{$dst|$dst, 1}", + [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>; +def ROL16m1 : I<0xD1, MRM0m, (outs), (ins i16mem:$dst), + "rol{w}\t{$dst|$dst, 1}", + [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize16; +def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst), + "rol{l}\t{$dst|$dst, 1}", + [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>, + OpSize32; +def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst), + "rol{q}\t{$dst|$dst, 1}", + [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>, + Requires<[In64BitMode]>; +} // SchedRW + +let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { +let Uses = [CL] in { +def ROR8rCL : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "ror{b}\t{%cl, $dst|$dst, cl}", + [(set GR8:$dst, (rotr GR8:$src1, CL))]>; +def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "ror{w}\t{%cl, $dst|$dst, cl}", + [(set GR16:$dst, (rotr GR16:$src1, CL))]>, OpSize16; +def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "ror{l}\t{%cl, $dst|$dst, cl}", + [(set GR32:$dst, (rotr GR32:$src1, CL))]>, OpSize32; +def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), + "ror{q}\t{%cl, $dst|$dst, cl}", + [(set GR64:$dst, (rotr GR64:$src1, CL))]>; +} + +def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), + "ror{b}\t{$src2, $dst|$dst, $src2}", + [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))]>; +def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), + "ror{w}\t{$src2, $dst|$dst, $src2}", + [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))]>, + OpSize16; +def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), + "ror{l}\t{$src2, $dst|$dst, $src2}", + [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))]>, + OpSize32; +def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), + (ins GR64:$src1, u8imm:$src2), + "ror{q}\t{$src2, $dst|$dst, $src2}", + [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))]>; + +// Rotate by 1 +def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1), + "ror{b}\t{$$1, $dst|$dst, 1}", + [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))]>; +def 
ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1), + "ror{w}\t{$$1, $dst|$dst, 1}", + [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))]>, OpSize16; +def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1), + "ror{l}\t{$$1, $dst|$dst, 1}", + [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))]>, OpSize32; +def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1), + "ror{q}\t{$$1, $dst|$dst, 1}", + [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))]>; +} // Constraints = "$src = $dst", SchedRW + +let SchedRW = [WriteShiftLd, WriteRMW] in { +let Uses = [CL] in { +def ROR8mCL : I<0xD2, MRM1m, (outs), (ins i8mem :$dst), + "ror{b}\t{%cl, $dst|$dst, cl}", + [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>; +def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst), + "ror{w}\t{%cl, $dst|$dst, cl}", + [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize16; +def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), + "ror{l}\t{%cl, $dst|$dst, cl}", + [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>, OpSize32; +def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), + "ror{q}\t{%cl, $dst|$dst, cl}", + [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>, + Requires<[In64BitMode]>; +} +def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src), + "ror{b}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>; +def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src), + "ror{w}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize16; +def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src), + "ror{l}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + OpSize32; +def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src), + "ror{q}\t{$src, $dst|$dst, $src}", + [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>, + Requires<[In64BitMode]>; + +// Rotate by 1 +def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst), + "ror{b}\t{$dst|$dst, 1}", + [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)]>; +def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst), + "ror{w}\t{$dst|$dst, 1}", + [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)]>, + OpSize16; +def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst), + "ror{l}\t{$dst|$dst, 1}", + [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)]>, + OpSize32; +def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), + "ror{q}\t{$dst|$dst, 1}", + [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)]>, + Requires<[In64BitMode]>; +} // SchedRW + + +//===----------------------------------------------------------------------===// +// Double shift instructions (generalizations of rotate) +//===----------------------------------------------------------------------===// + +let Constraints = "$src1 = $dst" in { + +let Uses = [CL], SchedRW = [WriteSHDrrcl] in { +def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2), + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>, + TB, OpSize16; +def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2), + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>, + TB, OpSize16; +def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR32:$dst, (X86shld GR32:$src1, 
GR32:$src2, CL))]>, + TB, OpSize32; +def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2), + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>, + TB, OpSize32; +def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>, + TB; +def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2), + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>, + TB; +} // SchedRW + +let isCommutable = 1, SchedRW = [WriteSHDrri] in { // These instructions commute to each other. +def SHLD16rri8 : Ii8<0xA4, MRMDestReg, + (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), + "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, + (i8 imm:$src3)))]>, + TB, OpSize16; +def SHRD16rri8 : Ii8<0xAC, MRMDestReg, + (outs GR16:$dst), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), + "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, + (i8 imm:$src3)))]>, + TB, OpSize16; +def SHLD32rri8 : Ii8<0xA4, MRMDestReg, + (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), + "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, + (i8 imm:$src3)))]>, + TB, OpSize32; +def SHRD32rri8 : Ii8<0xAC, MRMDestReg, + (outs GR32:$dst), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), + "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, + (i8 imm:$src3)))]>, + TB, OpSize32; +def SHLD64rri8 : RIi8<0xA4, MRMDestReg, + (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), + "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, + (i8 imm:$src3)))]>, + TB; +def SHRD64rri8 : RIi8<0xAC, MRMDestReg, + (outs GR64:$dst), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), + "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, + (i8 imm:$src3)))]>, + TB; +} // SchedRW +} // Constraints = "$src = $dst" + +let Uses = [CL], SchedRW = [WriteSHDmrcl] in { +def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL), + addr:$dst)]>, TB, OpSize16; +def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), + "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL), + addr:$dst)]>, TB, OpSize16; + +def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL), + addr:$dst)]>, TB, OpSize32; +def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), + "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL), + addr:$dst)]>, TB, OpSize32; + +def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL), + addr:$dst)]>, TB; +def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), + "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}", + [(store (X86shrd 
(loadi64 addr:$dst), GR64:$src2, CL), + addr:$dst)]>, TB; +} // SchedRW + +let SchedRW = [WriteSHDmri] in { +def SHLD16mri8 : Ii8<0xA4, MRMDestMem, + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), + "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi16 addr:$dst), GR16:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB, OpSize16; +def SHRD16mri8 : Ii8<0xAC, MRMDestMem, + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), + "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB, OpSize16; + +def SHLD32mri8 : Ii8<0xA4, MRMDestMem, + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), + "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB, OpSize32; +def SHRD32mri8 : Ii8<0xAC, MRMDestMem, + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), + "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB, OpSize32; + +def SHLD64mri8 : RIi8<0xA4, MRMDestMem, + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), + "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shld (loadi64 addr:$dst), GR64:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB; +def SHRD64mri8 : RIi8<0xAC, MRMDestMem, + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), + "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, + (i8 imm:$src3)), addr:$dst)]>, + TB; +} // SchedRW + +} // Defs = [EFLAGS] + +// Sandy Bridge and newer Intel processors support faster rotates using +// SHLD to avoid a partial flag update on the normal rotate instructions. +let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in { + def : Pat<(rotl GR32:$src, (i8 imm:$shamt)), + (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>; + def : Pat<(rotl GR64:$src, (i8 imm:$shamt)), + (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>; +} + +def ROT32L2R_imm8 : SDNodeXForm<imm, [{ + // Convert a ROTL shamt to a ROTR shamt on 32-bit integer. + return getI8Imm(32 - N->getZExtValue(), SDLoc(N)); +}]>; + +def ROT64L2R_imm8 : SDNodeXForm<imm, [{ + // Convert a ROTL shamt to a ROTR shamt on 64-bit integer. 
+ return getI8Imm(64 - N->getZExtValue(), SDLoc(N)); +}]>; + +multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { +let hasSideEffects = 0 in { + def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TAXD, VEX, Sched<[WriteShift]>; + let mayLoad = 1 in + def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst), + (ins x86memop:$src1, u8imm:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, TAXD, VEX, Sched<[WriteShiftLd]>; +} +} + +multiclass bmi_shift<string asm, RegisterClass RC, X86MemOperand x86memop> { +let hasSideEffects = 0 in { + def rr : I<0xF7, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + VEX, Sched<[WriteShift]>; + let mayLoad = 1 in + def rm : I<0xF7, MRMSrcMem4VOp3, + (outs RC:$dst), (ins x86memop:$src1, RC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, + VEX, Sched<[WriteShiftLd, + // x86memop:$src1 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src2 + ReadAfterLd]>; +} +} + +let Predicates = [HasBMI2] in { + defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>; + defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, VEX_W; + defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS; + defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, VEX_W; + defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD; + defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W; + defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8PD; + defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8PD, VEX_W; + + // Prefer RORX which is non-destructive and doesn't update EFLAGS. + let AddedComplexity = 10 in { + def : Pat<(rotl GR32:$src, (i8 imm:$shamt)), + (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>; + def : Pat<(rotl GR64:$src, (i8 imm:$shamt)), + (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>; + } + + def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)), + (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>; + def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)), + (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>; + + // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not + // immediate shift, i.e. the following code is considered better + // + // mov %edi, %esi + // shl $imm, %esi + // ... %edi, ... + // + // than + // + // movb $imm, %sil + // shlx %sil, %edi, %esi + // ... %edi, ... + // + let AddedComplexity = 1 in { + def : Pat<(sra GR32:$src1, GR8:$src2), + (SARX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(sra GR64:$src1, GR8:$src2), + (SARX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(srl GR32:$src1, GR8:$src2), + (SHRX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(srl GR64:$src1, GR8:$src2), + (SHRX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(shl GR32:$src1, GR8:$src2), + (SHLX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(shl GR64:$src1, GR8:$src2), + (SHLX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + } + + // We prefer to use + // mov (%ecx), %esi + // shl $imm, %esi + // + // over + // + // movb $imm, %al + // shlx %al, (%ecx), %esi + // + // This priority is enforced by IsProfitableToFoldLoad. 
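+ // The patterns below cover the remaining case: the shifted value is loaded
+ // from memory and the count is already in a register, so folding the load
+ // into the BMI2 "rm" form costs no extra instruction.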
+ def : Pat<(sra (loadi32 addr:$src1), GR8:$src2), + (SARX32rm addr:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(sra (loadi64 addr:$src1), GR8:$src2), + (SARX64rm addr:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(srl (loadi32 addr:$src1), GR8:$src2), + (SHRX32rm addr:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(srl (loadi64 addr:$src1), GR8:$src2), + (SHRX64rm addr:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(shl (loadi32 addr:$src1), GR8:$src2), + (SHLX32rm addr:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(shl (loadi64 addr:$src1), GR8:$src2), + (SHLX64rm addr:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; +} diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrSystem.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrSystem.td new file mode 100644 index 000000000..e9dba76f4 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrSystem.td @@ -0,0 +1,743 @@ +//===-- X86InstrSystem.td - System Instructions ------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 instructions that are generally used in +// privileged modes. These are not typically used by the compiler, but are +// supported for the assembler and disassembler. +// +//===----------------------------------------------------------------------===// + +let SchedRW = [WriteSystem] in { +let Defs = [RAX, RDX] in + def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>, TB; + +let Defs = [RAX, RCX, RDX] in + def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB; + +// CPU flow control instructions + +let mayLoad = 1, mayStore = 0, hasSideEffects = 1, isTrap = 1 in { + def UD2 : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB; + def UD1 : I<0xB9, RawFrm, (outs), (ins), "ud1", []>, TB; + def UD0 : I<0xFF, RawFrm, (outs), (ins), "ud0", []>, TB; +} + +def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>; +def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", []>, TB; + +// Interrupt and SysCall Instructions. +let Uses = [EFLAGS] in + def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>, Requires<[Not64BitMode]>; + +def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))]>; +} // SchedRW + +// The long form of "int $3" turns into int3 as a size optimization. +// FIXME: This doesn't work because InstAlias can't match immediate constants. 
+//def : InstAlias<"int\t$3", (INT3)>; + +let SchedRW = [WriteSystem] in { + +def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap", + [(int_x86_int imm:$trap)]>; + + +def SYSCALL : I<0x05, RawFrm, (outs), (ins), "syscall", []>, TB; +def SYSRET : I<0x07, RawFrm, (outs), (ins), "sysret{l}", []>, TB; +def SYSRET64 :RI<0x07, RawFrm, (outs), (ins), "sysretq", []>, TB, + Requires<[In64BitMode]>; + +def SYSENTER : I<0x34, RawFrm, (outs), (ins), "sysenter", []>, TB; + +def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", []>, TB; +def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexitq", []>, TB, + Requires<[In64BitMode]>; +} // SchedRW + +def : Pat<(debugtrap), + (INT3)>, Requires<[NotPS4]>; +def : Pat<(debugtrap), + (INT (i8 0x41))>, Requires<[IsPS4]>; + +//===----------------------------------------------------------------------===// +// Input/Output Instructions. +// +let SchedRW = [WriteSystem] in { +let Defs = [AL], Uses = [DX] in +def IN8rr : I<0xEC, RawFrm, (outs), (ins), "in{b}\t{%dx, %al|al, dx}", []>; +let Defs = [AX], Uses = [DX] in +def IN16rr : I<0xED, RawFrm, (outs), (ins), "in{w}\t{%dx, %ax|ax, dx}", []>, + OpSize16; +let Defs = [EAX], Uses = [DX] in +def IN32rr : I<0xED, RawFrm, (outs), (ins), "in{l}\t{%dx, %eax|eax, dx}", []>, + OpSize32; + +let Defs = [AL] in +def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port), + "in{b}\t{$port, %al|al, $port}", []>; +let Defs = [AX] in +def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), + "in{w}\t{$port, %ax|ax, $port}", []>, OpSize16; +let Defs = [EAX] in +def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), + "in{l}\t{$port, %eax|eax, $port}", []>, OpSize32; + +let Uses = [DX, AL] in +def OUT8rr : I<0xEE, RawFrm, (outs), (ins), "out{b}\t{%al, %dx|dx, al}", []>; +let Uses = [DX, AX] in +def OUT16rr : I<0xEF, RawFrm, (outs), (ins), "out{w}\t{%ax, %dx|dx, ax}", []>, + OpSize16; +let Uses = [DX, EAX] in +def OUT32rr : I<0xEF, RawFrm, (outs), (ins), "out{l}\t{%eax, %dx|dx, eax}", []>, + OpSize32; + +let Uses = [AL] in +def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port), + "out{b}\t{%al, $port|$port, al}", []>; +let Uses = [AX] in +def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), + "out{w}\t{%ax, $port|$port, ax}", []>, OpSize16; +let Uses = [EAX] in +def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), + "out{l}\t{%eax, $port|$port, eax}", []>, OpSize32; + +} // SchedRW + +//===----------------------------------------------------------------------===// +// Moves to and from debug registers + +let SchedRW = [WriteSystem] in { +def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, TB, + Requires<[Not64BitMode]>; +def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>, TB, + Requires<[In64BitMode]>; + +def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, TB, + Requires<[Not64BitMode]>; +def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>, TB, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Moves to and from control registers + +let SchedRW = [WriteSystem] in { +def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, TB, + Requires<[Not64BitMode]>; +def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins 
CONTROL_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>, TB, + Requires<[In64BitMode]>; + +def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, TB, + Requires<[Not64BitMode]>; +def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>, TB, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Segment override instruction prefixes + +//let SchedRW = [WriteNop] in { +//def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>; +//def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>; +//def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>; +//def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>; +//def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>; +//def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>; +//} // SchedRW + +//===----------------------------------------------------------------------===// +// Moves to and from segment registers. +// + +let SchedRW = [WriteMove] in { +def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16; +def MOV32rs : I<0x8C, MRMDestReg, (outs GR32:$dst), (ins SEGMENT_REG:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32; +def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>; +let mayStore = 1 in { +def MOV16ms : I<0x8C, MRMDestMem, (outs), (ins i16mem:$dst, SEGMENT_REG:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>; +} +def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize16; +def MOV32sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR32:$src), + "mov{l}\t{$src, $dst|$dst, $src}", []>, OpSize32; +def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src), + "mov{q}\t{$src, $dst|$dst, $src}", []>; +let mayLoad = 1 in { +def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src), + "mov{w}\t{$src, $dst|$dst, $src}", []>; +} +} // SchedRW + +//===----------------------------------------------------------------------===// +// Segmentation support instructions. + +let SchedRW = [WriteSystem] in { +def SWAPGS : I<0x01, MRM_F8, (outs), (ins), "swapgs", []>, TB; + +let mayLoad = 1 in +def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, + OpSize16, NotMemoryFoldable; +def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, + OpSize16, NotMemoryFoldable; + +// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo. +let mayLoad = 1 in +def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "lar{l}\t{$src, $dst|$dst, $src}", []>, TB, + OpSize32, NotMemoryFoldable; +def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lar{l}\t{$src, $dst|$dst, $src}", []>, TB, + OpSize32, NotMemoryFoldable; +// i16mem operand in LAR64rm and GR32 operand in LAR64rr is not a typo. +let mayLoad = 1 in +def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "lar{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable; +def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), + "lar{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable; + +// i16mem operand in LSL32rm and GR32 operand in LSL32rr is not a typo. 
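+// (Both LAR and LSL read a 16-bit segment selector, so the memory form is an
+// m16 load regardless of the destination register width.)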
+let mayLoad = 1 in +def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), + "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, + OpSize16, NotMemoryFoldable; +def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), + "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, + OpSize16, NotMemoryFoldable; +// i16mem operand in LSL64rm and GR32 operand in LSL64rr is not a typo. +let mayLoad = 1 in +def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), + "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB, + OpSize32, NotMemoryFoldable; +def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB, + OpSize32, NotMemoryFoldable; +let mayLoad = 1 in +def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), + "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable; +def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src), + "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB, NotMemoryFoldable; + +def INVLPG : I<0x01, MRM7m, (outs), (ins i8mem:$addr), "invlpg\t$addr", []>, TB; + +def STR16r : I<0x00, MRM1r, (outs GR16:$dst), (ins), + "str{w}\t$dst", []>, TB, OpSize16; +def STR32r : I<0x00, MRM1r, (outs GR32:$dst), (ins), + "str{l}\t$dst", []>, TB, OpSize32; +def STR64r : RI<0x00, MRM1r, (outs GR64:$dst), (ins), + "str{q}\t$dst", []>, TB; +let mayStore = 1 in +def STRm : I<0x00, MRM1m, (outs), (ins i16mem:$dst), "str{w}\t$dst", []>, TB; + +def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src), "ltr{w}\t$src", []>, TB, NotMemoryFoldable; +let mayLoad = 1 in +def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src), "ltr{w}\t$src", []>, TB, NotMemoryFoldable; + +def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins), "push{w}\t{%cs|cs}", []>, + OpSize16, Requires<[Not64BitMode]>; +def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins), "push{l}\t{%cs|cs}", []>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHSS16 : I<0x16, RawFrm, (outs), (ins), "push{w}\t{%ss|ss}", []>, + OpSize16, Requires<[Not64BitMode]>; +def PUSHSS32 : I<0x16, RawFrm, (outs), (ins), "push{l}\t{%ss|ss}", []>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins), "push{w}\t{%ds|ds}", []>, + OpSize16, Requires<[Not64BitMode]>; +def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins), "push{l}\t{%ds|ds}", []>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHES16 : I<0x06, RawFrm, (outs), (ins), "push{w}\t{%es|es}", []>, + OpSize16, Requires<[Not64BitMode]>; +def PUSHES32 : I<0x06, RawFrm, (outs), (ins), "push{l}\t{%es|es}", []>, + OpSize32, Requires<[Not64BitMode]>; +def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins), "push{w}\t{%fs|fs}", []>, + OpSize16, TB; +def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins), "push{l}\t{%fs|fs}", []>, TB, + OpSize32, Requires<[Not64BitMode]>; +def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins), "push{w}\t{%gs|gs}", []>, + OpSize16, TB; +def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins), "push{l}\t{%gs|gs}", []>, TB, + OpSize32, Requires<[Not64BitMode]>; +def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins), "push{q}\t{%fs|fs}", []>, TB, + OpSize32, Requires<[In64BitMode]>; +def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins), "push{q}\t{%gs|gs}", []>, TB, + OpSize32, Requires<[In64BitMode]>; + +// No "pop cs" instruction. 
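// (On the 8086, opcode 0x0F encoded "pop cs"; on later processors it is the
// two-byte opcode escape, the TB prefix class used throughout this file, so
// only ss/ds/es have single-byte pop forms and fs/gs are reached through the
// escape: the POPFS* and POPGS* definitions below encode as 0F A1 and 0F A9.)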
+def POPSS16 : I<0x17, RawFrm, (outs), (ins), "pop{w}\t{%ss|ss}", []>, + OpSize16, Requires<[Not64BitMode]>; +def POPSS32 : I<0x17, RawFrm, (outs), (ins), "pop{l}\t{%ss|ss}", []>, + OpSize32, Requires<[Not64BitMode]>; + +def POPDS16 : I<0x1F, RawFrm, (outs), (ins), "pop{w}\t{%ds|ds}", []>, + OpSize16, Requires<[Not64BitMode]>; +def POPDS32 : I<0x1F, RawFrm, (outs), (ins), "pop{l}\t{%ds|ds}", []>, + OpSize32, Requires<[Not64BitMode]>; + +def POPES16 : I<0x07, RawFrm, (outs), (ins), "pop{w}\t{%es|es}", []>, + OpSize16, Requires<[Not64BitMode]>; +def POPES32 : I<0x07, RawFrm, (outs), (ins), "pop{l}\t{%es|es}", []>, + OpSize32, Requires<[Not64BitMode]>; + +def POPFS16 : I<0xa1, RawFrm, (outs), (ins), "pop{w}\t{%fs|fs}", []>, + OpSize16, TB; +def POPFS32 : I<0xa1, RawFrm, (outs), (ins), "pop{l}\t{%fs|fs}", []>, TB, + OpSize32, Requires<[Not64BitMode]>; +def POPFS64 : I<0xa1, RawFrm, (outs), (ins), "pop{q}\t{%fs|fs}", []>, TB, + OpSize32, Requires<[In64BitMode]>; + +def POPGS16 : I<0xa9, RawFrm, (outs), (ins), "pop{w}\t{%gs|gs}", []>, + OpSize16, TB; +def POPGS32 : I<0xa9, RawFrm, (outs), (ins), "pop{l}\t{%gs|gs}", []>, TB, + OpSize32, Requires<[Not64BitMode]>; +def POPGS64 : I<0xa9, RawFrm, (outs), (ins), "pop{q}\t{%gs|gs}", []>, TB, + OpSize32, Requires<[In64BitMode]>; + +def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src), + "lds{w}\t{$src, $dst|$dst, $src}", []>, OpSize16, + Requires<[Not64BitMode]>; +def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src), + "lds{l}\t{$src, $dst|$dst, $src}", []>, OpSize32, + Requires<[Not64BitMode]>; + +def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src), + "lss{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16; +def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src), + "lss{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32; +def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaquemem:$src), + "lss{q}\t{$src, $dst|$dst, $src}", []>, TB; + +def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src), + "les{w}\t{$src, $dst|$dst, $src}", []>, OpSize16, + Requires<[Not64BitMode]>; +def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src), + "les{l}\t{$src, $dst|$dst, $src}", []>, OpSize32, + Requires<[Not64BitMode]>; + +def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src), + "lfs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16; +def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src), + "lfs{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32; +def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaquemem:$src), + "lfs{q}\t{$src, $dst|$dst, $src}", []>, TB; + +def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaquemem:$src), + "lgs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize16; +def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaquemem:$src), + "lgs{l}\t{$src, $dst|$dst, $src}", []>, TB, OpSize32; + +def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaquemem:$src), + "lgs\t{$src, $dst|$dst, $src}", []>, TB; + +def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg), "verr\t$seg", []>, TB, NotMemoryFoldable; +def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg), "verw\t$seg", []>, TB, NotMemoryFoldable; +let mayLoad = 1 in { +def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg), "verr\t$seg", []>, TB, NotMemoryFoldable; +def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg), "verw\t$seg", []>, TB, NotMemoryFoldable; +} +} // SchedRW + 
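// A note on the asm-string syntax used above and below (an informal summary,
// not spelled out in the .td source itself): inside an asm string, "{a|b}"
// selects per assembler dialect, the first alternative feeding the AT&T
// printer/parser and the second the Intel one, while a lone "{w}"/"{l}"/"{q}"
// adds the operand-size suffix only in AT&T output. For example MOV16rs,
// defined above with the string
//   "mov{w}\t{$src, $dst|$dst, $src}"
// prints roughly as
//   AT&T:   movw %es, %ax
//   Intel:  mov  ax, es
// for a copy of ES into AX.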
+//===----------------------------------------------------------------------===// +// Descriptor-table support instructions + +let SchedRW = [WriteSystem] in { +def SGDT16m : I<0x01, MRM0m, (outs), (ins opaquemem:$dst), + "sgdt{w}\t$dst", []>, TB, OpSize16, Requires<[Not64BitMode]>; +def SGDT32m : I<0x01, MRM0m, (outs), (ins opaquemem:$dst), + "sgdt{l}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>; +def SGDT64m : I<0x01, MRM0m, (outs), (ins opaquemem:$dst), + "sgdt{q}\t$dst", []>, TB, Requires <[In64BitMode]>; +def SIDT16m : I<0x01, MRM1m, (outs), (ins opaquemem:$dst), + "sidt{w}\t$dst", []>, TB, OpSize16, Requires<[Not64BitMode]>; +def SIDT32m : I<0x01, MRM1m, (outs), (ins opaquemem:$dst), + "sidt{l}\t$dst", []>, OpSize32, TB, Requires <[Not64BitMode]>; +def SIDT64m : I<0x01, MRM1m, (outs), (ins opaquemem:$dst), + "sidt{q}\t$dst", []>, TB, Requires <[In64BitMode]>; +def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins), + "sldt{w}\t$dst", []>, TB, OpSize16; +let mayStore = 1 in +def SLDT16m : I<0x00, MRM0m, (outs), (ins i16mem:$dst), + "sldt{w}\t$dst", []>, TB; +def SLDT32r : I<0x00, MRM0r, (outs GR32:$dst), (ins), + "sldt{l}\t$dst", []>, OpSize32, TB; + +// LLDT is not interpreted specially in 64-bit mode because there is no sign +// extension. +def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins), + "sldt{q}\t$dst", []>, TB, Requires<[In64BitMode]>; + +def LGDT16m : I<0x01, MRM2m, (outs), (ins opaquemem:$src), + "lgdt{w}\t$src", []>, TB, OpSize16, Requires<[Not64BitMode]>; +def LGDT32m : I<0x01, MRM2m, (outs), (ins opaquemem:$src), + "lgdt{l}\t$src", []>, OpSize32, TB, Requires<[Not64BitMode]>; +def LGDT64m : I<0x01, MRM2m, (outs), (ins opaquemem:$src), + "lgdt{q}\t$src", []>, TB, Requires<[In64BitMode]>; +def LIDT16m : I<0x01, MRM3m, (outs), (ins opaquemem:$src), + "lidt{w}\t$src", []>, TB, OpSize16, Requires<[Not64BitMode]>; +def LIDT32m : I<0x01, MRM3m, (outs), (ins opaquemem:$src), + "lidt{l}\t$src", []>, OpSize32, TB, Requires<[Not64BitMode]>; +def LIDT64m : I<0x01, MRM3m, (outs), (ins opaquemem:$src), + "lidt{q}\t$src", []>, TB, Requires<[In64BitMode]>; +def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src), + "lldt{w}\t$src", []>, TB, NotMemoryFoldable; +let mayLoad = 1 in +def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src), + "lldt{w}\t$src", []>, TB, NotMemoryFoldable; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Specialized register support +let SchedRW = [WriteSystem] in { +let Uses = [EAX, ECX, EDX] in +def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB; +let Defs = [EAX, EDX], Uses = [ECX] in +def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB; + +let Defs = [RAX, RDX], Uses = [ECX] in + def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", [(X86rdpmc)]>, TB; + +def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), + "smsw{w}\t$dst", []>, OpSize16, TB; +def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), + "smsw{l}\t$dst", []>, OpSize32, TB; +// no m form encodable; use SMSW16m +def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), + "smsw{q}\t$dst", []>, TB; + +// For memory operands, there is only a 16-bit form +def SMSW16m : I<0x01, MRM4m, (outs), (ins i16mem:$dst), + "smsw{w}\t$dst", []>, TB; + +def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src), + "lmsw{w}\t$src", []>, TB, NotMemoryFoldable; +let mayLoad = 1 in +def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src), + "lmsw{w}\t$src", []>, TB, NotMemoryFoldable; + +let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in + def CPUID : 
I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Cache instructions +let SchedRW = [WriteSystem] in { +def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB; +def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, TB; + +// wbnoinvd is like wbinvd, except without invalidation +// encoding: like wbinvd + an 0xF3 prefix +def WBNOINVD : I<0x09, RawFrm, (outs), (ins), "wbnoinvd", + [(int_x86_wbnoinvd)]>, XS, + Requires<[HasWBNOINVD]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// CET instructions +// Use with caution, availability is not predicated on features. +let SchedRW = [WriteSystem] in { + let Uses = [SSP] in { + let Defs = [SSP] in { + def INCSSPD : I<0xAE, MRM5r, (outs), (ins GR32:$src), "incsspd\t$src", + [(int_x86_incsspd GR32:$src)]>, XS; + def INCSSPQ : RI<0xAE, MRM5r, (outs), (ins GR64:$src), "incsspq\t$src", + [(int_x86_incsspq GR64:$src)]>, XS; + } // Defs SSP + + let Constraints = "$src = $dst" in { + def RDSSPD : I<0x1E, MRM1r, (outs GR32:$dst), (ins GR32:$src), + "rdsspd\t$dst", + [(set GR32:$dst, (int_x86_rdsspd GR32:$src))]>, XS; + def RDSSPQ : RI<0x1E, MRM1r, (outs GR64:$dst), (ins GR64:$src), + "rdsspq\t$dst", + [(set GR64:$dst, (int_x86_rdsspq GR64:$src))]>, XS; + } + + let Defs = [SSP] in { + def SAVEPREVSSP : I<0x01, MRM_EA, (outs), (ins), "saveprevssp", + [(int_x86_saveprevssp)]>, XS; + def RSTORSSP : I<0x01, MRM5m, (outs), (ins i32mem:$src), + "rstorssp\t$src", + [(int_x86_rstorssp addr:$src)]>, XS; + } // Defs SSP + } // Uses SSP + + def WRSSD : I<0xF6, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "wrssd\t{$src, $dst|$dst, $src}", + [(int_x86_wrssd GR32:$src, addr:$dst)]>, T8PS; + def WRSSQ : RI<0xF6, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "wrssq\t{$src, $dst|$dst, $src}", + [(int_x86_wrssq GR64:$src, addr:$dst)]>, T8PS; + def WRUSSD : I<0xF5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "wrussd\t{$src, $dst|$dst, $src}", + [(int_x86_wrussd GR32:$src, addr:$dst)]>, T8PD; + def WRUSSQ : RI<0xF5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "wrussq\t{$src, $dst|$dst, $src}", + [(int_x86_wrussq GR64:$src, addr:$dst)]>, T8PD; + + let Defs = [SSP] in { + let Uses = [SSP] in { + def SETSSBSY : I<0x01, MRM_E8, (outs), (ins), "setssbsy", + [(int_x86_setssbsy)]>, XS; + } // Uses SSP + + def CLRSSBSY : I<0xAE, MRM6m, (outs), (ins i32mem:$src), + "clrssbsy\t$src", + [(int_x86_clrssbsy addr:$src)]>, XS; + } // Defs SSP +} // SchedRW + +let SchedRW = [WriteSystem] in { + def ENDBR64 : I<0x1E, MRM_FA, (outs), (ins), "endbr64", []>, XS; + def ENDBR32 : I<0x1E, MRM_FB, (outs), (ins), "endbr32", []>, XS; +} // SchedRW + +//===----------------------------------------------------------------------===// +// XSAVE instructions +let SchedRW = [WriteSystem] in { +let Predicates = [HasXSAVE] in { +let Defs = [EDX, EAX], Uses = [ECX] in + def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; + +let Uses = [EDX, EAX, ECX] in + def XSETBV : I<0x01, MRM_D1, (outs), (ins), + "xsetbv", + [(int_x86_xsetbv ECX, EDX, EAX)]>, TB; + +} // HasXSAVE + +let Uses = [EDX, EAX] in { +def XSAVE : I<0xAE, MRM4m, (outs), (ins opaquemem:$dst), + "xsave\t$dst", + [(int_x86_xsave addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>; +def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaquemem:$dst), + "xsave64\t$dst", + [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>; 
+def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaquemem:$dst), + "xrstor\t$dst", + [(int_x86_xrstor addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>; +def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaquemem:$dst), + "xrstor64\t$dst", + [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>; +def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaquemem:$dst), + "xsaveopt\t$dst", + [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT]>; +def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaquemem:$dst), + "xsaveopt64\t$dst", + [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT, In64BitMode]>; +def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaquemem:$dst), + "xsavec\t$dst", + [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC]>; +def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaquemem:$dst), + "xsavec64\t$dst", + [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC, In64BitMode]>; +def XSAVES : I<0xC7, MRM5m, (outs), (ins opaquemem:$dst), + "xsaves\t$dst", + [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>; +def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaquemem:$dst), + "xsaves64\t$dst", + [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>; +def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaquemem:$dst), + "xrstors\t$dst", + [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>; +def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaquemem:$dst), + "xrstors64\t$dst", + [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES, In64BitMode]>; +} // Uses +} // SchedRW + +//===----------------------------------------------------------------------===// +// VIA PadLock crypto instructions +let Defs = [RAX, RDI], Uses = [RDX, RDI], SchedRW = [WriteSystem] in + def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB; + +def : InstAlias<"xstorerng", (XSTORE)>; + +let SchedRW = [WriteSystem] in { +let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in { + def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB; + def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB; + def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB; + def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB; + def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB; +} + +let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { + def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB; + def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB; +} +let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in + def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB; +} // SchedRW + +/* +//==-----------------------------------------------------------------------===// +// PKU - enable protection key +let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { + def WRPKRU : PseudoI<(outs), (ins GR32:$src), + [(int_x86_wrpkru GR32:$src)]>; + def RDPKRU : PseudoI<(outs GR32:$dst), (ins), + [(set GR32:$dst, (int_x86_rdpkru))]>; +} +*/ + +let SchedRW = [WriteSystem] in { +let Defs = [EAX, EDX], Uses = [ECX] in + def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB; +let Uses = [EAX, ECX, EDX] in + def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB; +} // SchedRW + +//===----------------------------------------------------------------------===// +// FS/GS Base Instructions +let Predicates = [HasFSGSBase, In64BitMode], SchedRW = [WriteSystem] in { + def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins), + 
"rdfsbase{l}\t$dst", + [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS; + def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins), + "rdfsbase{q}\t$dst", + [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS; + def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins), + "rdgsbase{l}\t$dst", + [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS; + def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins), + "rdgsbase{q}\t$dst", + [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS; + def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src), + "wrfsbase{l}\t$src", + [(int_x86_wrfsbase_32 GR32:$src)]>, XS; + def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src), + "wrfsbase{q}\t$src", + [(int_x86_wrfsbase_64 GR64:$src)]>, XS; + def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src), + "wrgsbase{l}\t$src", + [(int_x86_wrgsbase_32 GR32:$src)]>, XS; + def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src), + "wrgsbase{q}\t$src", + [(int_x86_wrgsbase_64 GR64:$src)]>, XS; +} + +//===----------------------------------------------------------------------===// +// INVPCID Instruction +let SchedRW = [WriteSystem] in { +def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invpcid\t{$src2, $src1|$src1, $src2}", + [(int_x86_invpcid GR32:$src1, addr:$src2)]>, T8PD, + Requires<[Not64BitMode, HasINVPCID]>; +def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[In64BitMode, HasINVPCID]>; +} // SchedRW + +let Predicates = [In64BitMode, HasINVPCID] in { + // The instruction can only use a 64 bit register as the register argument + // in 64 bit mode, while the intrinsic only accepts a 32 bit argument + // corresponding to it. + // The accepted values for now are 0,1,2,3 anyways (see Intel SDM -- INVCPID + // type),/ so it doesn't hurt us that one can't supply a 64 bit value here. + def : Pat<(int_x86_invpcid GR32:$src1, addr:$src2), + (INVPCID64 + (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src1), sub_32bit), + addr:$src2)>; +} + + +//===----------------------------------------------------------------------===// +// SMAP Instruction +let Defs = [EFLAGS], SchedRW = [WriteSystem] in { + def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB; + def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB; +} + +//===----------------------------------------------------------------------===// +// SMX Instruction +let SchedRW = [WriteSystem] in { +let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in { + def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB; +} // Uses, Defs +} // SchedRW + +//===----------------------------------------------------------------------===// +// TS flag control instruction. +let SchedRW = [WriteSystem] in { +def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", []>, TB; +} + +//===----------------------------------------------------------------------===// +// IF (inside EFLAGS) management instructions. 
+let SchedRW = [WriteSystem], Uses = [EFLAGS], Defs = [EFLAGS] in { +def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", []>; +def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>; +} + +//===----------------------------------------------------------------------===// +// RDPID Instruction +let SchedRW = [WriteSystem] in { +def RDPID32 : I<0xC7, MRM7r, (outs GR32:$dst), (ins), + "rdpid\t$dst", [(set GR32:$dst, (int_x86_rdpid))]>, XS, + Requires<[Not64BitMode, HasRDPID]>; +def RDPID64 : I<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdpid\t$dst", []>, XS, + Requires<[In64BitMode, HasRDPID]>; +} // SchedRW + +let Predicates = [In64BitMode, HasRDPID] in { + // Due to silly instruction definition, we have to compensate for the + // instruction outputing a 64-bit register. + def : Pat<(int_x86_rdpid), + (EXTRACT_SUBREG (RDPID64), sub_32bit)>; +} + + +//===----------------------------------------------------------------------===// +// PTWRITE Instruction - Write Data to a Processor Trace Packet +let SchedRW = [WriteSystem] in { +def PTWRITEm: I<0xAE, MRM4m, (outs), (ins i32mem:$dst), + "ptwrite{l}\t$dst", [(int_x86_ptwrite32 (loadi32 addr:$dst))]>, XS, + Requires<[HasPTWRITE]>; +def PTWRITE64m : RI<0xAE, MRM4m, (outs), (ins i64mem:$dst), + "ptwrite{q}\t$dst", [(int_x86_ptwrite64 (loadi64 addr:$dst))]>, XS, + Requires<[In64BitMode, HasPTWRITE]>; + +def PTWRITEr : I<0xAE, MRM4r, (outs), (ins GR32:$dst), + "ptwrite{l}\t$dst", [(int_x86_ptwrite32 GR32:$dst)]>, XS, + Requires<[HasPTWRITE]>; +def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst), + "ptwrite{q}\t$dst", [(int_x86_ptwrite64 GR64:$dst)]>, XS, + Requires<[In64BitMode, HasPTWRITE]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// Platform Configuration instruction + +// From ISA docs: +// "This instruction is used to execute functions for configuring platform +// features. +// EAX: Leaf function to be invoked. +// RBX/RCX/RDX: Leaf-specific purpose." +// "Successful execution of the leaf clears RAX (set to zero) and ZF, CF, PF, +// AF, OF, and SF are cleared. In case of failure, the failure reason is +// indicated in RAX with ZF set to 1 and CF, PF, AF, OF, and SF are cleared." +// Thus all these mentioned registers are considered clobbered. + +let SchedRW = [WriteSystem] in { +let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX, RDX, EFLAGS] in + def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, TB, + Requires<[HasPCONFIG]>; +} // SchedRW diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrTSX.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrTSX.td new file mode 100644 index 000000000..b1fdd1807 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrTSX.td @@ -0,0 +1,60 @@ +//===-- X86InstrVMX.td - TSX Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel TSX instruction +// set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TSX instructions + +def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>, + [SDNPHasChain, SDNPSideEffect]>; + +let SchedRW = [WriteSystem] in { + +//let usesCustomInserter = 1 in +//def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins), +// "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>, +// Requires<[HasRTM]>; + +let isBranch = 1, isTerminator = 1, Defs = [EAX] in { +def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst), + "xbegin\t$dst", []>, OpSize16; +def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst), + "xbegin\t$dst", []>, OpSize32; +} + +// Psuedo instruction to fake the definition of EAX on the fallback code path. +//let isPseudo = 1, Defs = [EAX] in { +//def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>; +//} + +def XEND : I<0x01, MRM_D5, (outs), (ins), + "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>; + +let Defs = [EFLAGS] in +def XTEST : I<0x01, MRM_D6, (outs), (ins), + "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasRTM]>; + +def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), + "xabort\t$imm", + [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>; +} // SchedRW + +// HLE prefixes +let SchedRW = [WriteSystem] in { + +let isAsmParserOnly = 1 in { +def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>; +def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>; +} + +} // SchedRW diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrVMX.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrVMX.td new file mode 100644 index 000000000..06a438ebf --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrVMX.td @@ -0,0 +1,88 @@ +//===-- X86InstrVMX.td - VMX Instruction Set Extension -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel VMX instruction +// set. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// VMX instructions + +let SchedRW = [WriteSystem] in { +// 66 0F 38 80 +def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[Not64BitMode]>; +def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[In64BitMode]>; + +// 66 0F 38 81 +def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), + "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[Not64BitMode]>; +def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), + "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + Requires<[In64BitMode]>; + +// 0F 01 C1 +def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB; +def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), + "vmclear\t$vmcs", []>, PD; + +// OF 01 D4 +def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB; + +// 0F 01 C2 +def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB; + +// 0F 01 C3 +def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB; +def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), + "vmptrld\t$vmcs", []>, PS; +def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs), + "vmptrst\t$vmcs", []>, PS; +def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), + "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>, + NotMemoryFoldable; +def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), + "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>, + NotMemoryFoldable; + +let mayStore = 1 in { +def VMREAD64mr : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>, + NotMemoryFoldable; +def VMREAD32mr : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>, + NotMemoryFoldable; +} // mayStore + +def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>, + NotMemoryFoldable; +def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>, + NotMemoryFoldable; + +let mayLoad = 1 in { +def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>, + NotMemoryFoldable; +def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>, + NotMemoryFoldable; +} // mayLoad + +// 0F 01 C4 +def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB; +def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon), + "vmxon\t$vmxon", []>, XS; +} // SchedRW diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrVecCompiler.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrVecCompiler.td new file mode 100644 index 000000000..322bdb74e --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrVecCompiler.td @@ -0,0 +1,511 @@ +//===- X86InstrVecCompiler.td - Vector Compiler Patterns ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed 
under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the various vector pseudo instructions used by the +// compiler, as well as Pat patterns used during instruction selection. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// No op bitconverts +//===----------------------------------------------------------------------===// + +// Bitcasts between 128-bit vector types. Return the original type since +// no instruction is needed for the conversion +def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; + +// Bitcasts between 256-bit vector types. 
Return the original type since +// no instruction is needed for the conversion +def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>; +def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>; +def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>; +def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>; +def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>; +def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>; +def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>; +def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>; +def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>; +def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>; +def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>; +def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>; +def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>; +def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>; +def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>; +def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>; +def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>; +def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>; +def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>; +def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>; +def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>; +def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>; +def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>; +def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>; +def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>; +def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>; +def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>; +def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>; +def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>; +def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>; + +// Bitcasts between 512-bit vector types. Return the original type since +// no instruction is needed for the conversion. 
+def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; +def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>; +def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>; +def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>; +def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>; +def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>; +def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>; +def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>; +def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>; +def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>; +def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>; +def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>; +def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>; +def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>; +def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; +def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; +def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>; +def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>; +def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>; +def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>; +def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>; +def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>; +def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>; +def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>; +def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; +def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>; +def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>; +def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>; +def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>; +def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>; + + +//===----------------------------------------------------------------------===// +// Non-instruction patterns +//===----------------------------------------------------------------------===// + +// A vector extract of the first f32/f64 position is a subregister copy +def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; +def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>; + +// Implicitly promote a 32-bit scalar to a vector. +def : Pat<(v4f32 (scalar_to_vector FR32:$src)), + (COPY_TO_REGCLASS FR32:$src, VR128)>; +// Implicitly promote a 64-bit scalar to a vector. 
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)), + (COPY_TO_REGCLASS FR64:$src, VR128)>; + + +//===----------------------------------------------------------------------===// +// Subvector tricks +//===----------------------------------------------------------------------===// + +// Patterns for insert_subvector/extract_subvector to/from index=0 +multiclass subvector_subreg_lowering<RegisterClass subRC, ValueType subVT, + RegisterClass RC, ValueType VT, + SubRegIndex subIdx> { + def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))), + (subVT (EXTRACT_SUBREG RC:$src, subIdx))>; + + def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))), + (VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>; +} + +// A 128-bit subvector extract from the first 256-bit vector position is a +// subregister copy that needs no instruction. Likewise, a 128-bit subvector +// insert to the first 256-bit vector position is a subregister copy that needs +// no instruction. +defm : subvector_subreg_lowering<VR128, v4i32, VR256, v8i32, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v4f32, VR256, v8f32, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v2i64, VR256, v4i64, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v2f64, VR256, v4f64, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v8i16, VR256, v16i16, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v16i8, VR256, v32i8, sub_xmm>; + +// A 128-bit subvector extract from the first 512-bit vector position is a +// subregister copy that needs no instruction. Likewise, a 128-bit subvector +// insert to the first 512-bit vector position is a subregister copy that needs +// no instruction. +defm : subvector_subreg_lowering<VR128, v4i32, VR512, v16i32, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v4f32, VR512, v16f32, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v2i64, VR512, v8i64, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v2f64, VR512, v8f64, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v8i16, VR512, v32i16, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v16i8, VR512, v64i8, sub_xmm>; + +// A 128-bit subvector extract from the first 512-bit vector position is a +// subregister copy that needs no instruction. Likewise, a 128-bit subvector +// insert to the first 512-bit vector position is a subregister copy that needs +// no instruction. 
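// To make the multiclass above concrete: each defm below just stamps out the
// two Pat definitions with the parameters substituted. For instance (a sketch
// of the mechanical expansion, not an additional definition),
//   defm : subvector_subreg_lowering<VR256, v8f32, VR512, v16f32, sub_ymm>;
// yields roughly
//   def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
//             (v8f32 (EXTRACT_SUBREG VR512:$src, sub_ymm))>;
//   def : Pat<(v16f32 (insert_subvector undef, VR256:$src, (iPTR 0))),
//             (v16f32 (INSERT_SUBREG (IMPLICIT_DEF), VR256:$src, sub_ymm))>;
// i.e. both directions become plain sub_ymm subregister operations that emit
// no instruction.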
+defm : subvector_subreg_lowering<VR256, v8i32, VR512, v16i32, sub_ymm>; +defm : subvector_subreg_lowering<VR256, v8f32, VR512, v16f32, sub_ymm>; +defm : subvector_subreg_lowering<VR256, v4i64, VR512, v8i64, sub_ymm>; +defm : subvector_subreg_lowering<VR256, v4f64, VR512, v8f64, sub_ymm>; +defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>; +defm : subvector_subreg_lowering<VR256, v32i8, VR512, v64i8, sub_ymm>; + + +multiclass subvector_store_lowering<string AlignedStr, string UnalignedStr, + RegisterClass RC, ValueType DstTy, + ValueType SrcTy, SubRegIndex SubIdx> { + def : Pat<(alignedstore (DstTy (extract_subvector + (SrcTy RC:$src), (iPTR 0))), addr:$dst), + (!cast<Instruction>("VMOV"#AlignedStr#"mr") addr:$dst, + (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>; + + def : Pat<(store (DstTy (extract_subvector + (SrcTy RC:$src), (iPTR 0))), addr:$dst), + (!cast<Instruction>("VMOV"#UnalignedStr#"mr") addr:$dst, + (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm : subvector_store_lowering<"APD", "UPD", VR256X, v2f64, v4f64, sub_xmm>; + defm : subvector_store_lowering<"APS", "UPS", VR256X, v4f32, v8f32, sub_xmm>; + defm : subvector_store_lowering<"DQA", "DQU", VR256X, v2i64, v4i64, sub_xmm>; + defm : subvector_store_lowering<"DQA", "DQU", VR256X, v4i32, v8i32, sub_xmm>; + defm : subvector_store_lowering<"DQA", "DQU", VR256X, v8i16, v16i16, sub_xmm>; + defm : subvector_store_lowering<"DQA", "DQU", VR256X, v16i8, v32i8, sub_xmm>; +} + +let Predicates = [HasVLX] in { + // Special patterns for storing subvector extracts of lower 128-bits + // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr + defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR256X, v2f64, v4f64, + sub_xmm>; + defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32, + sub_xmm>; + defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v2i64, + v4i64, sub_xmm>; + defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v4i32, + v8i32, sub_xmm>; + defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v8i16, + v16i16, sub_xmm>; + defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v16i8, + v32i8, sub_xmm>; + + // Special patterns for storing subvector extracts of lower 128-bits of 512. + // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr + defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR512, v2f64, v8f64, + sub_xmm>; + defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32, + sub_xmm>; + defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v2i64, + v8i64, sub_xmm>; + defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v4i32, + v16i32, sub_xmm>; + defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v8i16, + v32i16, sub_xmm>; + defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v16i8, + v64i8, sub_xmm>; + + // Special patterns for storing subvector extracts of lower 256-bits of 512. 
+ // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr + defm : subvector_store_lowering<"APDZ256", "UPDZ256", VR512, v4f64, v8f64, + sub_ymm>; + defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32, + sub_ymm>; + defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v4i64, + v8i64, sub_ymm>; + defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v8i32, + v16i32, sub_ymm>; + defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v16i16, + v32i16, sub_ymm>; + defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v32i8, + v64i8, sub_ymm>; +} + +// If we're inserting into an all zeros vector, just use a plain move which +// will zero the upper bits. A post-isel hook will take care of removing +// any moves that we can prove are unnecessary. +multiclass subvec_zero_lowering<string MoveStr, + RegisterClass RC, ValueType DstTy, + ValueType SrcTy, ValueType ZeroTy, + SubRegIndex SubIdx> { + def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)), + (SrcTy RC:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), + (SrcTy (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src)), SubIdx)>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm : subvec_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, sub_xmm>; +} + +let Predicates = [HasVLX] in { + defm : subvec_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32, sub_xmm>; + + defm : subvec_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32, sub_xmm>; + + defm : subvec_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32, sub_ymm>; + defm : subvec_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32, sub_ymm>; + defm : subvec_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32, sub_ymm>; + defm : subvec_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32, sub_ymm>; + defm : subvec_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32, sub_ymm>; + defm : subvec_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32, sub_ymm>; +} + +let Predicates = [HasAVX512, NoVLX] in { + defm : subvec_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, sub_xmm>; + 
defm : subvec_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, sub_xmm>; + + defm : subvec_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32, sub_ymm>; + defm : subvec_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32, sub_ymm>; + defm : subvec_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32, sub_ymm>; + defm : subvec_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32, sub_ymm>; + defm : subvec_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32, sub_ymm>; + defm : subvec_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32, sub_ymm>; +} + +class maskzeroupper<ValueType vt, RegisterClass RC> : + PatLeaf<(vt RC:$src), [{ + return isMaskZeroExtended(N); + }]>; + +def maskzeroupperv1i1 : maskzeroupper<v1i1, VK1>; +def maskzeroupperv2i1 : maskzeroupper<v2i1, VK2>; +def maskzeroupperv4i1 : maskzeroupper<v4i1, VK4>; +def maskzeroupperv8i1 : maskzeroupper<v8i1, VK8>; +def maskzeroupperv16i1 : maskzeroupper<v16i1, VK16>; +def maskzeroupperv32i1 : maskzeroupper<v32i1, VK32>; + +// The patterns determine if we can depend on the upper bits of a mask register +// being zeroed by the previous operation so that we can skip explicit +// zeroing. +let Predicates = [HasBWI] in { + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + maskzeroupperv1i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK1:$src, VK32)>; + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + maskzeroupperv8i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK8:$src, VK32)>; + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + maskzeroupperv16i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK16:$src, VK32)>; + + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + maskzeroupperv1i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK1:$src, VK64)>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + maskzeroupperv8i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK8:$src, VK64)>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + maskzeroupperv16i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK16:$src, VK64)>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + maskzeroupperv32i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK32:$src, VK64)>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + maskzeroupperv1i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK1:$src, VK16)>; + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + maskzeroupperv8i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK8:$src, VK16)>; +} + +let Predicates = [HasDQI] in { + def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV), + maskzeroupperv1i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK1:$src, VK8)>; +} + +let Predicates = [HasVLX, HasDQI] in { + def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV), + maskzeroupperv2i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK2:$src, VK8)>; + def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV), + maskzeroupperv4i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK4:$src, VK8)>; +} + +let Predicates = [HasVLX] in { + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + maskzeroupperv2i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK2:$src, VK16)>; + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + maskzeroupperv4i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK4:$src, VK16)>; +} + +let Predicates = [HasBWI, HasVLX] in { + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + maskzeroupperv2i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK2:$src, VK32)>; + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + 
maskzeroupperv4i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK4:$src, VK32)>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + maskzeroupperv2i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK2:$src, VK64)>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + maskzeroupperv4i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK4:$src, VK64)>; +} + +// If the bits are not zero we have to fall back to explicitly zeroing by +// using shifts. +let Predicates = [HasAVX512] in { + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + (v1i1 VK1:$mask), (iPTR 0))), + (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK1:$mask, VK16), + (i8 15)), (i8 15))>; + + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + (v2i1 VK2:$mask), (iPTR 0))), + (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK2:$mask, VK16), + (i8 14)), (i8 14))>; + + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + (v4i1 VK4:$mask), (iPTR 0))), + (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK4:$mask, VK16), + (i8 12)), (i8 12))>; +} + +let Predicates = [HasAVX512, NoDQI] in { + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + (v8i1 VK8:$mask), (iPTR 0))), + (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK8:$mask, VK16), + (i8 8)), (i8 8))>; +} + +let Predicates = [HasDQI] in { + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + (v8i1 VK8:$mask), (iPTR 0))), + (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK16)>; + + def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV), + (v1i1 VK1:$mask), (iPTR 0))), + (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK1:$mask, VK8), + (i8 7)), (i8 7))>; + def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV), + (v2i1 VK2:$mask), (iPTR 0))), + (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK2:$mask, VK8), + (i8 6)), (i8 6))>; + def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV), + (v4i1 VK4:$mask), (iPTR 0))), + (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK4:$mask, VK8), + (i8 4)), (i8 4))>; +} + +let Predicates = [HasBWI] in { + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + (v16i1 VK16:$mask), (iPTR 0))), + (COPY_TO_REGCLASS (KMOVWkk VK16:$mask), VK32)>; + + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v16i1 VK16:$mask), (iPTR 0))), + (COPY_TO_REGCLASS (KMOVWkk VK16:$mask), VK64)>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v32i1 VK32:$mask), (iPTR 0))), + (COPY_TO_REGCLASS (KMOVDkk VK32:$mask), VK64)>; +} + +let Predicates = [HasBWI, NoDQI] in { + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + (v8i1 VK8:$mask), (iPTR 0))), + (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK8:$mask, VK32), + (i8 24)), (i8 24))>; + + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v8i1 VK8:$mask), (iPTR 0))), + (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK8:$mask, VK64), + (i8 56)), (i8 56))>; +} + +let Predicates = [HasBWI, HasDQI] in { + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + (v8i1 VK8:$mask), (iPTR 0))), + (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK32)>; + + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v8i1 VK8:$mask), (iPTR 0))), + (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK64)>; +} + +let Predicates = [HasBWI, HasVLX] in { + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + (v1i1 VK1:$mask), (iPTR 0))), + (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK1:$mask, VK32), + (i8 31)), (i8 31))>; + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + (v2i1 VK2:$mask), (iPTR 0))), + (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK2:$mask, VK32), + (i8 30)), (i8 30))>; + def : Pat<(v32i1 (insert_subvector 
(v32i1 immAllZerosV), + (v4i1 VK4:$mask), (iPTR 0))), + (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK4:$mask, VK32), + (i8 28)), (i8 28))>; + + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v1i1 VK1:$mask), (iPTR 0))), + (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK1:$mask, VK64), + (i8 63)), (i8 63))>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v2i1 VK2:$mask), (iPTR 0))), + (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK2:$mask, VK64), + (i8 62)), (i8 62))>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v4i1 VK4:$mask), (iPTR 0))), + (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK4:$mask, VK64), + (i8 60)), (i8 60))>; +} diff --git a/capstone/suite/synctools/tablegen/X86/back/X86InstrXOP.td b/capstone/suite/synctools/tablegen/X86/back/X86InstrXOP.td new file mode 100644 index 000000000..ff3e3be48 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86InstrXOP.td @@ -0,0 +1,446 @@ +//===-- X86InstrXOP.td - XOP Instruction Set ---------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes XOP (eXtended OPerations) +// +//===----------------------------------------------------------------------===// + +multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWritePHAdd.XMM]>; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, + Sched<[SchedWritePHAdd.XMM.Folded, ReadAfterLd]>; +} + +let ExeDomain = SSEPackedInt in { + defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>; + defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>; + defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>; + defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>; + defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>; + defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>; + defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>; + defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>; + defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>; + defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>; + defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>; + defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>; + defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>; + defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>; + defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>; +} + +// Scalar load 2 addr operand instructions +multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, + Operand memop, ComplexPattern mem_cpat, + X86FoldableSchedWrite sched> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>; + def rm : IXOP<opc, 
MRMSrcMem, (outs VR128:$dst), (ins memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, + PatFrag memop, X86FoldableSchedWrite sched> { + def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, + Sched<[sched.Folded, ReadAfterLd]>; +} + +multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, + PatFrag memop, X86FoldableSchedWrite sched> { + def Yrr : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[sched]>; + def Yrm : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L, + Sched<[sched.Folded, ReadAfterLd]>; +} + +let ExeDomain = SSEPackedSingle in { + defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, + ssmem, sse_load_f32, SchedWriteFRnd.Scl>; + defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32, + SchedWriteFRnd.XMM>; + defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32, + SchedWriteFRnd.YMM>; +} + +let ExeDomain = SSEPackedDouble in { + defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, + sdmem, sse_load_f64, SchedWriteFRnd.Scl>; + defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64, + SchedWriteFRnd.XMM>; + defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64, + SchedWriteFRnd.YMM>; +} + +multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128, X86FoldableSchedWrite sched> { + def rr : IXOP<opc, MRMSrcReg4VOp3, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>, + XOP, Sched<[sched]>; + def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, + XOP_4V, VEX_W, Sched<[sched.Folded, ReadAfterLd]>; + def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst), + (ins i128mem:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), + (vt128 VR128:$src2))))]>, + XOP, Sched<[sched.Folded, ReadAfterLd]>; + // For disassembler + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : IXOP<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + []>, + XOP_4V, VEX_W, Sched<[sched]>, FoldGenData<NAME#rr>; +} + +let ExeDomain = SSEPackedInt in { + defm VPROTB : xop3op<0x90, "vprotb", rotl, v16i8, SchedWriteVarVecShift.XMM>; + defm VPROTD : xop3op<0x92, "vprotd", rotl, v4i32, SchedWriteVarVecShift.XMM>; + defm VPROTQ : 
xop3op<0x93, "vprotq", rotl, v2i64, SchedWriteVarVecShift.XMM>; + defm VPROTW : xop3op<0x91, "vprotw", rotl, v8i16, SchedWriteVarVecShift.XMM>; + defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8, SchedWriteVarVecShift.XMM>; + defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32, SchedWriteVarVecShift.XMM>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64, SchedWriteVarVecShift.XMM>; + defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16, SchedWriteVarVecShift.XMM>; + defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8, SchedWriteVarVecShift.XMM>; + defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32, SchedWriteVarVecShift.XMM>; + defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64, SchedWriteVarVecShift.XMM>; + defm VPSHLW : xop3op<0x95, "vpshlw", X86vpshl, v8i16, SchedWriteVarVecShift.XMM>; +} + +multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128, X86FoldableSchedWrite sched> { + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, + XOP, Sched<[sched]>; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins i128mem:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, + XOP, Sched<[sched.Folded, ReadAfterLd]>; +} + +let ExeDomain = SSEPackedInt in { + defm VPROTB : xop3opimm<0xC0, "vprotb", X86vrotli, v16i8, + SchedWriteVecShiftImm.XMM>; + defm VPROTD : xop3opimm<0xC2, "vprotd", X86vrotli, v4i32, + SchedWriteVecShiftImm.XMM>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vrotli, v2i64, + SchedWriteVecShiftImm.XMM>; + defm VPROTW : xop3opimm<0xC1, "vprotw", X86vrotli, v8i16, + SchedWriteVecShiftImm.XMM>; +} + +// Instruction where second source can be memory, but third must be register +multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int, + X86FoldableSchedWrite sched> { + let isCommutable = 1 in + def rr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V, + Sched<[sched]>; + def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), + VR128:$src3))]>, XOP_4V, Sched<[sched.Folded, ReadAfterLd]>; +} + +let ExeDomain = SSEPackedInt in { + defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", + int_x86_xop_vpmadcswd, SchedWriteVecIMul.XMM>; + defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", + int_x86_xop_vpmadcsswd, SchedWriteVecIMul.XMM>; + defm VPMACSWW : xop4opm2<0x95, "vpmacsww", + int_x86_xop_vpmacsww, SchedWriteVecIMul.XMM>; + defm VPMACSWD : xop4opm2<0x96, "vpmacswd", + int_x86_xop_vpmacswd, SchedWriteVecIMul.XMM>; + defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", + int_x86_xop_vpmacssww, SchedWriteVecIMul.XMM>; + defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", + int_x86_xop_vpmacsswd, SchedWriteVecIMul.XMM>; + defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", + int_x86_xop_vpmacssdql, SchedWritePMULLD.XMM>; + defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", + int_x86_xop_vpmacssdqh, SchedWritePMULLD.XMM>; + defm VPMACSSDD : 
xop4opm2<0x8E, "vpmacssdd", + int_x86_xop_vpmacssdd, SchedWritePMULLD.XMM>; + defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", + int_x86_xop_vpmacsdql, SchedWritePMULLD.XMM>; + defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", + int_x86_xop_vpmacsdqh, SchedWritePMULLD.XMM>; + defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", + int_x86_xop_vpmacsdd, SchedWritePMULLD.XMM>; +} + +// IFMA patterns - for cases where we can safely ignore the overflow bits from +// the multiply or easily match with existing intrinsics. +let Predicates = [HasXOP] in { + def : Pat<(v8i16 (add (mul (v8i16 VR128:$src1), (v8i16 VR128:$src2)), + (v8i16 VR128:$src3))), + (VPMACSWWrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v4i32 (add (mul (v4i32 VR128:$src1), (v4i32 VR128:$src2)), + (v4i32 VR128:$src3))), + (VPMACSDDrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v2i64 (add (X86pmuldq (bc_v2i64 (X86PShufd (v4i32 VR128:$src1), (i8 -11))), + (bc_v2i64 (X86PShufd (v4i32 VR128:$src2), (i8 -11)))), + (v2i64 VR128:$src3))), + (VPMACSDQHrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v2i64 (add (X86pmuldq (v2i64 VR128:$src1), (v2i64 VR128:$src2)), + (v2i64 VR128:$src3))), + (VPMACSDQLrr VR128:$src1, VR128:$src2, VR128:$src3)>; + def : Pat<(v4i32 (add (X86vpmaddwd (v8i16 VR128:$src1), (v8i16 VR128:$src2)), + (v4i32 VR128:$src3))), + (VPMADCSWDrr VR128:$src1, VR128:$src2, VR128:$src3)>; +} + +// Transforms to swizzle an immediate to help matching memory operand in first +// operand. +def CommuteVPCOMCC : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue() & 0x7; + Imm = X86::getSwappedVPCOMImm(Imm); + return getI8Imm(Imm, SDLoc(N)); +}]>; + +// Instruction where second source can be memory, third must be imm8 +multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128, + X86FoldableSchedWrite sched> { + let ExeDomain = SSEPackedInt in { // SSE integer instructions + let isCommutable = 1 in + def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, XOPCC:$cc), + !strconcat("vpcom${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + imm:$cc)))]>, + XOP_4V, Sched<[sched]>; + def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, XOPCC:$cc), + !strconcat("vpcom${cc}", Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))), + imm:$cc)))]>, + XOP_4V, Sched<[sched.Folded, ReadAfterLd]>; + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + !strconcat("vpcom", Suffix, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_4V, Sched<[sched]>, NotMemoryFoldable; + let mayLoad = 1 in + def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + !strconcat("vpcom", Suffix, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_4V, Sched<[sched.Folded, ReadAfterLd]>, + NotMemoryFoldable; + } + } + + def : Pat<(OpNode (bitconvert (loadv2i64 addr:$src2)), + (vt128 VR128:$src1), imm:$cc), + (!cast<Instruction>(NAME#"mi") VR128:$src1, addr:$src2, + (CommuteVPCOMCC imm:$cc))>; +} + +defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8, SchedWriteVecALU.XMM>; +defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16, SchedWriteVecALU.XMM>; +defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32, 
SchedWriteVecALU.XMM>; +defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64, SchedWriteVecALU.XMM>; +defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8, SchedWriteVecALU.XMM>; +defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16, SchedWriteVecALU.XMM>; +defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32, SchedWriteVecALU.XMM>; +defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64, SchedWriteVecALU.XMM>; + +multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128, X86FoldableSchedWrite sched> { + def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + (vt128 VR128:$src3))))]>, + XOP_4V, Sched<[sched]>; + def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, i128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + (vt128 (bitconvert (loadv2i64 addr:$src3))))))]>, + XOP_4V, VEX_W, Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; + def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set VR128:$dst, + (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))), + (vt128 VR128:$src3))))]>, + XOP_4V, Sched<[sched.Folded, ReadAfterLd, + // 128mem:$src2 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // VR128:$src3 + ReadAfterLd]>; + // For disassembler + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_4V, VEX_W, Sched<[sched]>, FoldGenData<NAME#rrr>; +} + +let ExeDomain = SSEPackedInt in { + defm VPPERM : xop4op<0xA3, "vpperm", X86vpperm, v16i8, + SchedWriteVarShuffle.XMM>; +} + +// Instruction where either second or third source can be memory +multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType VT, + X86FoldableSchedWrite sched> { + def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1), + (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V, + Sched<[sched]>; + def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1), + (X86andnp (load addr:$src3), RC:$src2))))]>, + XOP_4V, VEX_W, Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; + def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1), + (X86andnp RC:$src3, (load addr:$src2)))))]>, + XOP_4V, Sched<[sched.Folded, ReadAfterLd, + // x86memop:$src2 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC::$src3 + ReadAfterLd]>; + // For disassembler + let 
isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_4V, VEX_W, Sched<[sched]>, FoldGenData<NAME#rrr>; +} + +let ExeDomain = SSEPackedInt in { + defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64, + SchedWriteShuffle.XMM>; + defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64, + SchedWriteShuffle.YMM>, VEX_L; +} + +multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC, + X86MemOperand intmemop, X86MemOperand fpmemop, + ValueType VT, PatFrag FPLdFrag, PatFrag IntLdFrag, + X86FoldableSchedWrite sched> { + def rr : IXOP5<Opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set RC:$dst, + (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>, + Sched<[sched]>; + def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst), + (ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set RC:$dst, + (VT (X86vpermil2 RC:$src1, RC:$src2, + (bitconvert (IntLdFrag addr:$src3)), + (i8 imm:$src4))))]>, VEX_W, + Sched<[sched.Folded, ReadAfterLd, ReadAfterLd]>; + def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + [(set RC:$dst, + (VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2), + RC:$src3, (i8 imm:$src4))))]>, + Sched<[sched.Folded, ReadAfterLd, + // fpmemop:$src2 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + // RC:$src3 + ReadAfterLd]>; + // For disassembler + let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in + def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4), + !strconcat(OpcodeStr, + "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), + []>, VEX_W, Sched<[sched]>, FoldGenData<NAME#rr>; +} + +let ExeDomain = SSEPackedDouble in { + defm VPERMIL2PD : xop_vpermil2<0x49, "vpermil2pd", VR128, i128mem, f128mem, + v2f64, loadv2f64, loadv2i64, + SchedWriteFVarShuffle.XMM>; + defm VPERMIL2PDY : xop_vpermil2<0x49, "vpermil2pd", VR256, i256mem, f256mem, + v4f64, loadv4f64, loadv4i64, + SchedWriteFVarShuffle.YMM>, VEX_L; +} + +let ExeDomain = SSEPackedSingle in { + defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem, + v4f32, loadv4f32, loadv2i64, + SchedWriteFVarShuffle.XMM>; + defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem, + v8f32, loadv8f32, loadv4i64, + SchedWriteFVarShuffle.YMM>, VEX_L; +} + diff --git a/capstone/suite/synctools/tablegen/X86/back/X86PfmCounters.td b/capstone/suite/synctools/tablegen/X86/back/X86PfmCounters.td new file mode 100644 index 000000000..093fbafa3 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86PfmCounters.td @@ -0,0 +1,77 @@ +//===-- X86PfmCounters.td - X86 Hardware Counters ----------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This describes the available hardware counters for various subtargets. +// +//===----------------------------------------------------------------------===// + +let SchedModel = SandyBridgeModel in { +def SBCycleCounter : PfmCycleCounter<"unhalted_core_cycles">; +def SBPort0Counter : PfmIssueCounter<SBPort0, ["uops_dispatched_port:port_0"]>; +def SBPort1Counter : PfmIssueCounter<SBPort1, ["uops_dispatched_port:port_1"]>; +def SBPort23Counter : PfmIssueCounter<SBPort23, + ["uops_dispatched_port:port_2", + "uops_dispatched_port:port_3"]>; +def SBPort4Counter : PfmIssueCounter<SBPort4, ["uops_dispatched_port:port_4"]>; +def SBPort5Counter : PfmIssueCounter<SBPort5, ["uops_dispatched_port:port_5"]>; +} + +let SchedModel = HaswellModel in { +def HWCycleCounter : PfmCycleCounter<"unhalted_core_cycles">; +def HWPort0Counter : PfmIssueCounter<HWPort0, ["uops_dispatched_port:port_0"]>; +def HWPort1Counter : PfmIssueCounter<HWPort1, ["uops_dispatched_port:port_1"]>; +def HWPort2Counter : PfmIssueCounter<HWPort2, ["uops_dispatched_port:port_2"]>; +def HWPort3Counter : PfmIssueCounter<HWPort3, ["uops_dispatched_port:port_3"]>; +def HWPort4Counter : PfmIssueCounter<HWPort4, ["uops_dispatched_port:port_4"]>; +def HWPort5Counter : PfmIssueCounter<HWPort5, ["uops_dispatched_port:port_5"]>; +def HWPort6Counter : PfmIssueCounter<HWPort6, ["uops_dispatched_port:port_6"]>; +def HWPort7Counter : PfmIssueCounter<HWPort7, ["uops_dispatched_port:port_7"]>; +} + +let SchedModel = BroadwellModel in { +def BWCycleCounter : PfmCycleCounter<"unhalted_core_cycles">; +def BWPort0Counter : PfmIssueCounter<BWPort0, ["uops_executed_port:port_0"]>; +def BWPort1Counter : PfmIssueCounter<BWPort1, ["uops_executed_port:port_1"]>; +def BWPort2Counter : PfmIssueCounter<BWPort2, ["uops_executed_port:port_2"]>; +def BWPort3Counter : PfmIssueCounter<BWPort3, ["uops_executed_port:port_3"]>; +def BWPort4Counter : PfmIssueCounter<BWPort4, ["uops_executed_port:port_4"]>; +def BWPort5Counter : PfmIssueCounter<BWPort5, ["uops_executed_port:port_5"]>; +def BWPort6Counter : PfmIssueCounter<BWPort6, ["uops_executed_port:port_6"]>; +def BWPort7Counter : PfmIssueCounter<BWPort7, ["uops_executed_port:port_7"]>; +} + +let SchedModel = SkylakeClientModel in { +def SKLCycleCounter : PfmCycleCounter<"unhalted_core_cycles">; +def SKLPort0Counter : PfmIssueCounter<SKLPort0, ["uops_dispatched_port:port_0"]>; +def SKLPort1Counter : PfmIssueCounter<SKLPort1, ["uops_dispatched_port:port_1"]>; +def SKLPort2Counter : PfmIssueCounter<SKLPort2, ["uops_dispatched_port:port_2"]>; +def SKLPort3Counter : PfmIssueCounter<SKLPort3, ["uops_dispatched_port:port_3"]>; +def SKLPort4Counter : PfmIssueCounter<SKLPort4, ["uops_dispatched_port:port_4"]>; +def SKLPort5Counter : PfmIssueCounter<SKLPort5, ["uops_dispatched_port:port_5"]>; +def SKLPort6Counter : PfmIssueCounter<SKLPort6, ["uops_dispatched_port:port_6"]>; +def SKLPort7Counter : PfmIssueCounter<SKLPort7, ["uops_dispatched_port:port_7"]>; +} + +let SchedModel = SkylakeServerModel in { +def SKXCycleCounter : PfmCycleCounter<"unhalted_core_cycles">; +def SKXPort0Counter : PfmIssueCounter<SKXPort0, ["uops_dispatched_port:port_0"]>; +def SKXPort1Counter : PfmIssueCounter<SKXPort1, ["uops_dispatched_port:port_1"]>; +def SKXPort2Counter : PfmIssueCounter<SKXPort2, ["uops_dispatched_port:port_2"]>; +def SKXPort3Counter : PfmIssueCounter<SKXPort3, ["uops_dispatched_port:port_3"]>; +def SKXPort4Counter : PfmIssueCounter<SKXPort4, 
["uops_dispatched_port:port_4"]>; +def SKXPort5Counter : PfmIssueCounter<SKXPort5, ["uops_dispatched_port:port_5"]>; +def SKXPort6Counter : PfmIssueCounter<SKXPort6, ["uops_dispatched_port:port_6"]>; +def SKXPort7Counter : PfmIssueCounter<SKXPort7, ["uops_dispatched_port:port_7"]>; +} + +let SchedModel = BtVer2Model in { +def JCycleCounter : PfmCycleCounter<"cpu_clk_unhalted">; +def JFPU0Counter : PfmIssueCounter<JFPU0, ["dispatched_fpu:pipe0"]>; +def JFPU1Counter : PfmIssueCounter<JFPU1, ["dispatched_fpu:pipe1"]>; +} diff --git a/capstone/suite/synctools/tablegen/X86/back/X86RegisterBanks.td b/capstone/suite/synctools/tablegen/X86/back/X86RegisterBanks.td new file mode 100644 index 000000000..6d17cd53a --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86RegisterBanks.td @@ -0,0 +1,17 @@ +//=- X86RegisterBank.td - Describe the AArch64 Banks -----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +/// General Purpose Registers: RAX, RCX,... +def GPRRegBank : RegisterBank<"GPR", [GR64]>; + +/// Floating Point/Vector Registers +def VECRRegBank : RegisterBank<"VECR", [VR512]>; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86RegisterInfo.td b/capstone/suite/synctools/tablegen/X86/back/X86RegisterInfo.td new file mode 100644 index 000000000..ee9e7891f --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86RegisterInfo.td @@ -0,0 +1,591 @@ +//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the X86 Register file, defining the registers themselves, +// aliases between the registers, and the register classes built out of the +// registers. +// +//===----------------------------------------------------------------------===// + +class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> { + let Namespace = "X86"; + let HWEncoding = Enc; + let SubRegs = subregs; +} + +// Subregister indices. +let Namespace = "X86" in { + def sub_8bit : SubRegIndex<8>; + def sub_8bit_hi : SubRegIndex<8, 8>; + def sub_8bit_hi_phony : SubRegIndex<8, 8>; + def sub_16bit : SubRegIndex<16>; + def sub_16bit_hi : SubRegIndex<16, 16>; + def sub_32bit : SubRegIndex<32>; + def sub_xmm : SubRegIndex<128>; + def sub_ymm : SubRegIndex<256>; +} + +//===----------------------------------------------------------------------===// +// Register definitions... +// + +// In the register alias definitions below, we define which registers alias +// which others. We only specify which registers the small registers alias, +// because the register file generator is smart enough to figure out that +// AL aliases AX if we tell it that AX aliased AL (for example). + +// Dwarf numbering is different for 32-bit and 64-bit, and there are +// variations by target as well. 
Currently the first entry is for X86-64, +// second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux +// and debug information on X86-32/Darwin) + +// 8-bit registers +// Low registers +def AL : X86Reg<"al", 0>; +def DL : X86Reg<"dl", 2>; +def CL : X86Reg<"cl", 1>; +def BL : X86Reg<"bl", 3>; + +// High registers. On x86-64, these cannot be used in any instruction +// with a REX prefix. +def AH : X86Reg<"ah", 4>; +def DH : X86Reg<"dh", 6>; +def CH : X86Reg<"ch", 5>; +def BH : X86Reg<"bh", 7>; + +// X86-64 only, requires REX. +let CostPerUse = 1 in { +def SIL : X86Reg<"sil", 6>; +def DIL : X86Reg<"dil", 7>; +def BPL : X86Reg<"bpl", 5>; +def SPL : X86Reg<"spl", 4>; +def R8B : X86Reg<"r8b", 8>; +def R9B : X86Reg<"r9b", 9>; +def R10B : X86Reg<"r10b", 10>; +def R11B : X86Reg<"r11b", 11>; +def R12B : X86Reg<"r12b", 12>; +def R13B : X86Reg<"r13b", 13>; +def R14B : X86Reg<"r14b", 14>; +def R15B : X86Reg<"r15b", 15>; +} + +let isArtificial = 1 in { +// High byte of the low 16 bits of the super-register: +def SIH : X86Reg<"", -1>; +def DIH : X86Reg<"", -1>; +def BPH : X86Reg<"", -1>; +def SPH : X86Reg<"", -1>; +def R8BH : X86Reg<"", -1>; +def R9BH : X86Reg<"", -1>; +def R10BH : X86Reg<"", -1>; +def R11BH : X86Reg<"", -1>; +def R12BH : X86Reg<"", -1>; +def R13BH : X86Reg<"", -1>; +def R14BH : X86Reg<"", -1>; +def R15BH : X86Reg<"", -1>; +// High word of the low 32 bits of the super-register: +def HAX : X86Reg<"", -1>; +def HDX : X86Reg<"", -1>; +def HCX : X86Reg<"", -1>; +def HBX : X86Reg<"", -1>; +def HSI : X86Reg<"", -1>; +def HDI : X86Reg<"", -1>; +def HBP : X86Reg<"", -1>; +def HSP : X86Reg<"", -1>; +def HIP : X86Reg<"", -1>; +def R8WH : X86Reg<"", -1>; +def R9WH : X86Reg<"", -1>; +def R10WH : X86Reg<"", -1>; +def R11WH : X86Reg<"", -1>; +def R12WH : X86Reg<"", -1>; +def R13WH : X86Reg<"", -1>; +def R14WH : X86Reg<"", -1>; +def R15WH : X86Reg<"", -1>; +} + +// 16-bit registers +let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in { +def AX : X86Reg<"ax", 0, [AL,AH]>; +def DX : X86Reg<"dx", 2, [DL,DH]>; +def CX : X86Reg<"cx", 1, [CL,CH]>; +def BX : X86Reg<"bx", 3, [BL,BH]>; +} +let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CoveredBySubRegs = 1 in { +def SI : X86Reg<"si", 6, [SIL,SIH]>; +def DI : X86Reg<"di", 7, [DIL,DIH]>; +def BP : X86Reg<"bp", 5, [BPL,BPH]>; +def SP : X86Reg<"sp", 4, [SPL,SPH]>; +} +def IP : X86Reg<"ip", 0>; + +// X86-64 only, requires REX. 
+let SubRegIndices = [sub_8bit, sub_8bit_hi_phony], CostPerUse = 1, + CoveredBySubRegs = 1 in { +def R8W : X86Reg<"r8w", 8, [R8B,R8BH]>; +def R9W : X86Reg<"r9w", 9, [R9B,R9BH]>; +def R10W : X86Reg<"r10w", 10, [R10B,R10BH]>; +def R11W : X86Reg<"r11w", 11, [R11B,R11BH]>; +def R12W : X86Reg<"r12w", 12, [R12B,R12BH]>; +def R13W : X86Reg<"r13w", 13, [R13B,R13BH]>; +def R14W : X86Reg<"r14w", 14, [R14B,R14BH]>; +def R15W : X86Reg<"r15w", 15, [R15B,R15BH]>; +} + +// 32-bit registers +let SubRegIndices = [sub_16bit, sub_16bit_hi], CoveredBySubRegs = 1 in { +def EAX : X86Reg<"eax", 0, [AX, HAX]>, DwarfRegNum<[-2, 0, 0]>; +def EDX : X86Reg<"edx", 2, [DX, HDX]>, DwarfRegNum<[-2, 2, 2]>; +def ECX : X86Reg<"ecx", 1, [CX, HCX]>, DwarfRegNum<[-2, 1, 1]>; +def EBX : X86Reg<"ebx", 3, [BX, HBX]>, DwarfRegNum<[-2, 3, 3]>; +def ESI : X86Reg<"esi", 6, [SI, HSI]>, DwarfRegNum<[-2, 6, 6]>; +def EDI : X86Reg<"edi", 7, [DI, HDI]>, DwarfRegNum<[-2, 7, 7]>; +def EBP : X86Reg<"ebp", 5, [BP, HBP]>, DwarfRegNum<[-2, 4, 5]>; +def ESP : X86Reg<"esp", 4, [SP, HSP]>, DwarfRegNum<[-2, 5, 4]>; +def EIP : X86Reg<"eip", 0, [IP, HIP]>, DwarfRegNum<[-2, 8, 8]>; +} + +// X86-64 only, requires REX +let SubRegIndices = [sub_16bit, sub_16bit_hi], CostPerUse = 1, + CoveredBySubRegs = 1 in { +def R8D : X86Reg<"r8d", 8, [R8W,R8WH]>; +def R9D : X86Reg<"r9d", 9, [R9W,R9WH]>; +def R10D : X86Reg<"r10d", 10, [R10W,R10WH]>; +def R11D : X86Reg<"r11d", 11, [R11W,R11WH]>; +def R12D : X86Reg<"r12d", 12, [R12W,R12WH]>; +def R13D : X86Reg<"r13d", 13, [R13W,R13WH]>; +def R14D : X86Reg<"r14d", 14, [R14W,R14WH]>; +def R15D : X86Reg<"r15d", 15, [R15W,R15WH]>; +} + +// 64-bit registers, X86-64 only +let SubRegIndices = [sub_32bit] in { +def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>; +def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>; +def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>; +def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>; +def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>; +def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>; +def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>; +def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>; + +// These also require REX. +let CostPerUse = 1 in { +def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>; +def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>; +def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>; +def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>; +def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>; +def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>; +def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>; +def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>; +def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>; +}} + +// MMX Registers. These are actually aliased to ST0 .. 
ST7 +def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>; +def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>; +def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>; +def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>; +def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>; +def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>; +def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>; +def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>; + +// Pseudo Floating Point registers +def FP0 : X86Reg<"fp0", 0>; +def FP1 : X86Reg<"fp1", 0>; +def FP2 : X86Reg<"fp2", 0>; +def FP3 : X86Reg<"fp3", 0>; +def FP4 : X86Reg<"fp4", 0>; +def FP5 : X86Reg<"fp5", 0>; +def FP6 : X86Reg<"fp6", 0>; +def FP7 : X86Reg<"fp7", 0>; + +// XMM Registers, used by the various SSE instruction set extensions. +def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>; +def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>; +def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>; +def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>; +def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>; +def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>; +def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>; +def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>; + +// X86-64 only +let CostPerUse = 1 in { +def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>; +def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>; +def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>; +def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>; +def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>; +def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>; +def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>; +def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>; + +def XMM16: X86Reg<"xmm16", 16>, DwarfRegNum<[67, -2, -2]>; +def XMM17: X86Reg<"xmm17", 17>, DwarfRegNum<[68, -2, -2]>; +def XMM18: X86Reg<"xmm18", 18>, DwarfRegNum<[69, -2, -2]>; +def XMM19: X86Reg<"xmm19", 19>, DwarfRegNum<[70, -2, -2]>; +def XMM20: X86Reg<"xmm20", 20>, DwarfRegNum<[71, -2, -2]>; +def XMM21: X86Reg<"xmm21", 21>, DwarfRegNum<[72, -2, -2]>; +def XMM22: X86Reg<"xmm22", 22>, DwarfRegNum<[73, -2, -2]>; +def XMM23: X86Reg<"xmm23", 23>, DwarfRegNum<[74, -2, -2]>; +def XMM24: X86Reg<"xmm24", 24>, DwarfRegNum<[75, -2, -2]>; +def XMM25: X86Reg<"xmm25", 25>, DwarfRegNum<[76, -2, -2]>; +def XMM26: X86Reg<"xmm26", 26>, DwarfRegNum<[77, -2, -2]>; +def XMM27: X86Reg<"xmm27", 27>, DwarfRegNum<[78, -2, -2]>; +def XMM28: X86Reg<"xmm28", 28>, DwarfRegNum<[79, -2, -2]>; +def XMM29: X86Reg<"xmm29", 29>, DwarfRegNum<[80, -2, -2]>; +def XMM30: X86Reg<"xmm30", 30>, DwarfRegNum<[81, -2, -2]>; +def XMM31: X86Reg<"xmm31", 31>, DwarfRegNum<[82, -2, -2]>; + +} // CostPerUse + +// YMM0-15 registers, used by AVX instructions and +// YMM16-31 registers, used by AVX-512 instructions. +let SubRegIndices = [sub_xmm] in { + foreach Index = 0-31 in { + def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>, + DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>; + } +} + +// ZMM Registers, used by AVX-512 instructions. +let SubRegIndices = [sub_ymm] in { + foreach Index = 0-31 in { + def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>, + DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>; + } +} + +// Mask Registers, used by AVX-512 instructions. 
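+// Illustrative expansion (not part of the original file): for Index = 0 the
+// YMM foreach loop above produces, via the # concatenation and !cast lookup,
+//   def YMM0 : X86Reg<"ymm0", 0, [XMM0]>, DwarfRegAlias<XMM0>;
+// i.e. each wider register wraps its narrower sub-register and shares its
+// DWARF number with the corresponding XMM register. The eight mask registers
+// that follow are defined individually.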
+def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, 93, 93]>; +def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, 94, 94]>; +def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, 95, 95]>; +def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, 96, 96]>; +def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, 97, 97]>; +def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, 98, 98]>; +def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, 99, 99]>; +def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, 100, 100]>; + +// Floating point stack registers. These don't map one-to-one to the FP +// pseudo registers, but we still mark them as aliasing FP registers. That +// way both kinds can be live without exceeding the stack depth. ST registers +// are only live around inline assembly. +def ST0 : X86Reg<"st(0)", 0>, DwarfRegNum<[33, 12, 11]>; +def ST1 : X86Reg<"st(1)", 1>, DwarfRegNum<[34, 13, 12]>; +def ST2 : X86Reg<"st(2)", 2>, DwarfRegNum<[35, 14, 13]>; +def ST3 : X86Reg<"st(3)", 3>, DwarfRegNum<[36, 15, 14]>; +def ST4 : X86Reg<"st(4)", 4>, DwarfRegNum<[37, 16, 15]>; +def ST5 : X86Reg<"st(5)", 5>, DwarfRegNum<[38, 17, 16]>; +def ST6 : X86Reg<"st(6)", 6>, DwarfRegNum<[39, 18, 17]>; +def ST7 : X86Reg<"st(7)", 7>, DwarfRegNum<[40, 19, 18]>; + +// Floating-point status word +def FPSW : X86Reg<"fpsw", 0>; + +// Status flags register. +// +// Note that some flags that are commonly thought of as part of the status +// flags register are modeled separately. Typically this is due to instructions +// reading and updating those flags independently of all the others. We don't +// want to create false dependencies between these instructions and so we use +// a separate register to model them. +def EFLAGS : X86Reg<"flags", 0>; + +// The direction flag. +def DF : X86Reg<"dirflag", 0>; + + +// Segment registers +def CS : X86Reg<"cs", 1>; +def DS : X86Reg<"ds", 3>; +def SS : X86Reg<"ss", 2>; +def ES : X86Reg<"es", 0>; +def FS : X86Reg<"fs", 4>; +def GS : X86Reg<"gs", 5>; + +// Debug registers +def DR0 : X86Reg<"dr0", 0>; +def DR1 : X86Reg<"dr1", 1>; +def DR2 : X86Reg<"dr2", 2>; +def DR3 : X86Reg<"dr3", 3>; +def DR4 : X86Reg<"dr4", 4>; +def DR5 : X86Reg<"dr5", 5>; +def DR6 : X86Reg<"dr6", 6>; +def DR7 : X86Reg<"dr7", 7>; +def DR8 : X86Reg<"dr8", 8>; +def DR9 : X86Reg<"dr9", 9>; +def DR10 : X86Reg<"dr10", 10>; +def DR11 : X86Reg<"dr11", 11>; +def DR12 : X86Reg<"dr12", 12>; +def DR13 : X86Reg<"dr13", 13>; +def DR14 : X86Reg<"dr14", 14>; +def DR15 : X86Reg<"dr15", 15>; + +// Control registers +def CR0 : X86Reg<"cr0", 0>; +def CR1 : X86Reg<"cr1", 1>; +def CR2 : X86Reg<"cr2", 2>; +def CR3 : X86Reg<"cr3", 3>; +def CR4 : X86Reg<"cr4", 4>; +def CR5 : X86Reg<"cr5", 5>; +def CR6 : X86Reg<"cr6", 6>; +def CR7 : X86Reg<"cr7", 7>; +def CR8 : X86Reg<"cr8", 8>; +def CR9 : X86Reg<"cr9", 9>; +def CR10 : X86Reg<"cr10", 10>; +def CR11 : X86Reg<"cr11", 11>; +def CR12 : X86Reg<"cr12", 12>; +def CR13 : X86Reg<"cr13", 13>; +def CR14 : X86Reg<"cr14", 14>; +def CR15 : X86Reg<"cr15", 15>; + +// Pseudo index registers +def EIZ : X86Reg<"eiz", 4>; +def RIZ : X86Reg<"riz", 4>; + +// Bound registers, used in MPX instructions +def BND0 : X86Reg<"bnd0", 0>; +def BND1 : X86Reg<"bnd1", 1>; +def BND2 : X86Reg<"bnd2", 2>; +def BND3 : X86Reg<"bnd3", 3>; + +// CET registers - Shadow Stack Pointer +def SSP : X86Reg<"ssp", 0>; + +//===----------------------------------------------------------------------===// +// Register Class Definitions... now that we have all of the pieces, define the +// top-level register classes. The order specified in the register list is +// implicitly defined to be the register allocation order. 
+// + +// List call-clobbered registers before callee-save registers. RBX, RBP, (and +// R12, R13, R14, and R15 for X86-64) are callee-save registers. +// In 64-mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and +// R8B, ... R15B. +// Allocate R12 and R13 last, as these require an extra byte when +// encoded in x86_64 instructions. +// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in +// 64-bit mode. The main complication is that they cannot be encoded in an +// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc. +// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d" +// cannot be encoded. +def GR8 : RegisterClass<"X86", [i8], 8, + (add AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL, + R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B)> { + let AltOrders = [(sub GR8, AH, BH, CH, DH)]; + let AltOrderSelect = [{ + return MF.getSubtarget<X86Subtarget>().is64Bit(); + }]; +} + +let isAllocatable = 0 in +def GRH8 : RegisterClass<"X86", [i8], 8, + (add SIH, DIH, BPH, SPH, R8BH, R9BH, R10BH, R11BH, + R12BH, R13BH, R14BH, R15BH)>; + +def GR16 : RegisterClass<"X86", [i16], 16, + (add AX, CX, DX, SI, DI, BX, BP, SP, + R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W)>; + +let isAllocatable = 0 in +def GRH16 : RegisterClass<"X86", [i16], 16, + (add HAX, HCX, HDX, HSI, HDI, HBX, HBP, HSP, HIP, + R8WH, R9WH, R10WH, R11WH, R12WH, R13WH, R14WH, + R15WH)>; + +def GR32 : RegisterClass<"X86", [i32], 32, + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP, + R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D)>; + +// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since +// RIP isn't really a register and it can't be used anywhere except in an +// address, but it doesn't cause trouble. +// FIXME: it *does* cause trouble - CheckBaseRegAndIndexReg() has extra +// tests because of the inclusion of RIP in this register class. +def GR64 : RegisterClass<"X86", [i64], 64, + (add RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + RBX, R14, R15, R12, R13, RBP, RSP, RIP)>; + +// Segment registers for use by MOV instructions (and others) that have a +// segment register as one operand. Always contain a 16-bit segment +// descriptor. +def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>; + +// Debug registers. +def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 15)>; + +// Control registers. +def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>; + +// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of +// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d" +// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers +// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD, +// and GR64_ABCD are classes for registers that support 8-bit h-register +// operations. 
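+// Illustrative sketch (not part of the original file): register classes are
+// built with dag set operators -- add, sub, and, and sequence -- all of which
+// appear in this file. For example, a hypothetical class holding the 16-bit
+// ABCD registers minus AX could be written as
+//   def GR16_CBD_HYPO : RegisterClass<"X86", [i16], 16, (sub GR16_ABCD, AX)>;
+// The actual ABCD subclasses follow.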
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, (add AL, CL, DL, BL)>; +def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, (add AH, CH, DH, BH)>; +def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>; +def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>; +def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>; +def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; +def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, + R8, R9, R11, RIP)>; +def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, + R8, R9, R10, R11, RIP)>; + +// GR8_NOREX - GR8 registers which do not require a REX prefix. +def GR8_NOREX : RegisterClass<"X86", [i8], 8, + (add AL, CL, DL, AH, CH, DH, BL, BH)> { + let AltOrders = [(sub GR8_NOREX, AH, BH, CH, DH)]; + let AltOrderSelect = [{ + return MF.getSubtarget<X86Subtarget>().is64Bit(); + }]; +} +// GR16_NOREX - GR16 registers which do not require a REX prefix. +def GR16_NOREX : RegisterClass<"X86", [i16], 16, + (add AX, CX, DX, SI, DI, BX, BP, SP)>; +// GR32_NOREX - GR32 registers which do not require a REX prefix. +def GR32_NOREX : RegisterClass<"X86", [i32], 32, + (add EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP)>; +// GR64_NOREX - GR64 registers which do not require a REX prefix. +def GR64_NOREX : RegisterClass<"X86", [i64], 64, + (add RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP)>; + +// GR32_NOSP - GR32 registers except ESP. +def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)>; + +// GR64_NOSP - GR64 registers except RSP (and RIP). +def GR64_NOSP : RegisterClass<"X86", [i64], 64, (sub GR64, RSP, RIP)>; + +// GR32_NOREX_NOSP - GR32 registers which do not require a REX prefix except +// ESP. +def GR32_NOREX_NOSP : RegisterClass<"X86", [i32], 32, + (and GR32_NOREX, GR32_NOSP)>; + +// GR64_NOREX_NOSP - GR64_NOREX registers except RSP. +def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64, + (and GR64_NOREX, GR64_NOSP)>; + +// Register classes used for ABIs that use 32-bit address accesses, +// while using the whole x84_64 ISA. + +// In such cases, it is fine to use RIP as we are sure the 32 high +// bits are not set. We do not need variants for NOSP as RIP is not +// allowed there. +// RIP is not spilled anywhere for now, so stick to 32-bit alignment +// to save on memory space. +// FIXME: We could allow all 64bit registers, but we would need +// something to check that the 32 high bits are not set, +// which we do not have right now. +def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>; + +// When RBP is used as a base pointer in a 32-bit addresses environement, +// this is also safe to use the full register to access addresses. +// Since RBP will never be spilled, stick to a 32 alignment to save +// on memory consumption. +def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32, + (add LOW32_ADDR_ACCESS, RBP)>; + +// A class to support the 'A' assembler constraint: [ER]AX then [ER]DX. +def GR32_AD : RegisterClass<"X86", [i32], 32, (add EAX, EDX)>; +def GR64_AD : RegisterClass<"X86", [i64], 64, (add RAX, RDX)>; + +// Scalar SSE2 floating point registers. +def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; + +def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; + + +// FIXME: This sets up the floating point register files as though they are f64 +// values, though they really are f80 values. 
This will cause us to spill +// values as 64-bit quantities instead of 80-bit quantities, which is much much +// faster on common hardware. In reality, this should be controlled by a +// command line option or something. + +def RFP32 : RegisterClass<"X86",[f32], 32, (sequence "FP%u", 0, 6)>; +def RFP64 : RegisterClass<"X86",[f64], 32, (add RFP32)>; +def RFP80 : RegisterClass<"X86",[f80], 32, (add RFP32)>; + +// Floating point stack registers (these are not allocatable by the +// register allocator - the floating point stackifier is responsible +// for transforming FPn allocations to STn registers) +def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { + let isAllocatable = 0; +} + +// Generic vector registers: VR64 and VR128. +// Ensure that float types are declared first - only float is legal on SSE1. +def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; +def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], + 128, (add FR32)>; +def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], + 256, (sequence "YMM%u", 0, 15)>; + +// Special classes that help the assembly parser choose some alternate +// instructions to favor 2-byte VEX encodings. +def VR128L : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], + 128, (sequence "XMM%u", 0, 7)>; +def VR128H : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], + 128, (sequence "XMM%u", 8, 15)>; +def VR256L : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], + 256, (sequence "YMM%u", 0, 7)>; +def VR256H : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], + 256, (sequence "YMM%u", 8, 15)>; + +// Status flags registers. +def CCR : RegisterClass<"X86", [i32], 32, (add EFLAGS)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} +def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} +def DFCCR : RegisterClass<"X86", [i32], 32, (add DF)> { + let CopyCost = -1; // Don't allow copying of status registers. + let isAllocatable = 0; +} + +// AVX-512 vector/mask registers. +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], + 512, (sequence "ZMM%u", 0, 31)>; + +// Scalar AVX-512 floating point registers. 
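+// Illustrative sketch (not part of the original file): FR32X below widens the
+// scalar FR32 class from XMM0-XMM15 to the EVEX-encodable XMM0-XMM31 range.
+// A hypothetical class covering only the EVEX-only upper half would read
+//   def FR32X_HI_HYPO : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 16, 31)>;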
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; + +def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; + +// Extended VR128 and VR256 for AVX-512 instructions +def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], + 128, (add FR32X)>; +def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], + 256, (sequence "YMM%u", 0, 31)>; + +// Mask registers +def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} +def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;} +def VK4 : RegisterClass<"X86", [v4i1], 16, (add VK2)> {let Size = 16;} +def VK8 : RegisterClass<"X86", [v8i1], 16, (add VK4)> {let Size = 16;} +def VK16 : RegisterClass<"X86", [v16i1], 16, (add VK8)> {let Size = 16;} +def VK32 : RegisterClass<"X86", [v32i1], 32, (add VK16)> {let Size = 32;} +def VK64 : RegisterClass<"X86", [v64i1], 64, (add VK32)> {let Size = 64;} + +def VK1WM : RegisterClass<"X86", [v1i1], 16, (sub VK1, K0)> {let Size = 16;} +def VK2WM : RegisterClass<"X86", [v2i1], 16, (sub VK2, K0)> {let Size = 16;} +def VK4WM : RegisterClass<"X86", [v4i1], 16, (sub VK4, K0)> {let Size = 16;} +def VK8WM : RegisterClass<"X86", [v8i1], 16, (sub VK8, K0)> {let Size = 16;} +def VK16WM : RegisterClass<"X86", [v16i1], 16, (add VK8WM)> {let Size = 16;} +def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} +def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} + +// Bound registers +def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86SchedBroadwell.td b/capstone/suite/synctools/tablegen/X86/back/X86SchedBroadwell.td new file mode 100755 index 000000000..6334d9e89 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86SchedBroadwell.td @@ -0,0 +1,1692 @@ +//=- X86SchedBroadwell.td - X86 Broadwell Scheduling ---------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Broadwell to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def BroadwellModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and BW can decode 4 + // instructions per cycle. + let IssueWidth = 4; + let MicroOpBufferSize = 192; // Based on the reorder buffer. + let LoadLatency = 5; + let MispredictPenalty = 16; + + // Based on the LSD (loop-stream detector) queue size and benchmarking data. + let LoopMicroOpBufferSize = 50; + + // This flag is set to allow the scheduler to assign a default model to + // unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = BroadwellModel in { + +// Broadwell can issue micro-ops to 8 different ports in one cycle. + +// Ports 0, 1, 5, and 6 handle all computation. +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. Port 7 can handle address calculations. 
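+// Illustrative note (not part of the original file): each port below is a
+// single-unit ProcResource, and the ProcResGroup defs that follow (BWPort01,
+// BWPort0156, ...) model micro-ops that may issue on any member port. A write
+// bound to a group, e.g. the hypothetical
+//   def : WriteRes<WriteHypotheticalALU, [BWPort0156]> { let Latency = 1; }
+// takes one cycle on whichever of ports 0/1/5/6 the scheduler assigns.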
+def BWPort0 : ProcResource<1>; +def BWPort1 : ProcResource<1>; +def BWPort2 : ProcResource<1>; +def BWPort3 : ProcResource<1>; +def BWPort4 : ProcResource<1>; +def BWPort5 : ProcResource<1>; +def BWPort6 : ProcResource<1>; +def BWPort7 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def BWPort01 : ProcResGroup<[BWPort0, BWPort1]>; +def BWPort23 : ProcResGroup<[BWPort2, BWPort3]>; +def BWPort237 : ProcResGroup<[BWPort2, BWPort3, BWPort7]>; +def BWPort04 : ProcResGroup<[BWPort0, BWPort4]>; +def BWPort05 : ProcResGroup<[BWPort0, BWPort5]>; +def BWPort06 : ProcResGroup<[BWPort0, BWPort6]>; +def BWPort15 : ProcResGroup<[BWPort1, BWPort5]>; +def BWPort16 : ProcResGroup<[BWPort1, BWPort6]>; +def BWPort56 : ProcResGroup<[BWPort5, BWPort6]>; +def BWPort015 : ProcResGroup<[BWPort0, BWPort1, BWPort5]>; +def BWPort056 : ProcResGroup<[BWPort0, BWPort5, BWPort6]>; +def BWPort0156: ProcResGroup<[BWPort0, BWPort1, BWPort5, BWPort6]>; + +// 60 Entry Unified Scheduler +def BWPortAny : ProcResGroup<[BWPort0, BWPort1, BWPort2, BWPort3, BWPort4, + BWPort5, BWPort6, BWPort7]> { + let BufferSize=60; +} + +// Integer division issued on port 0. +def BWDivider : ProcResource<1>; +// FP division and sqrt on port 0. +def BWFPDivider : ProcResource<1>; + +// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 5>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass BWWriteResPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1, + int LoadLat = 5> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to + // the latency (default = 5). + def : WriteRes<SchedRW.Folded, !listconcat([BWPort23], ExePorts)> { + let Latency = !add(Lat, LoadLat); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = !add(UOps, 1); + } +} + +// A folded store needs a cycle on port 4 for the store data, and an extra port +// 2/3/7 cycle to recompute the address. +def : WriteRes<WriteRMW, [BWPort237,BWPort4]>; + +// Arithmetic. +defm : BWWriteResPair<WriteALU, [BWPort0156], 1>; // Simple integer ALU op. +defm : BWWriteResPair<WriteADC, [BWPort06], 1>; // Integer ALU + flags op. +defm : BWWriteResPair<WriteIMul, [BWPort1], 3>; // Integer multiplication. +defm : BWWriteResPair<WriteIMul64, [BWPort1], 3>; // Integer 64-bit multiplication. 
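+
+// Illustrative expansion (not part of the original file): the WriteIMul pair
+// above is roughly equivalent to spelling out both variants by hand:
+//   def : WriteRes<WriteIMul, [BWPort1]> { let Latency = 3; }
+//   def : WriteRes<WriteIMul.Folded, [BWPort23, BWPort1]> {
+//     let Latency = 8;       // 3 cycle multiply plus the 5 cycle load (LoadLat)
+//     let NumMicroOps = 2;   // load micro-op plus multiply micro-op
+//   }
+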
+defm : BWWriteResPair<WriteDiv8, [BWPort0, BWDivider], 25, [1, 10]>; +defm : BWWriteResPair<WriteDiv16, [BWPort0, BWDivider], 25, [1, 10]>; +defm : BWWriteResPair<WriteDiv32, [BWPort0, BWDivider], 25, [1, 10]>; +defm : BWWriteResPair<WriteDiv64, [BWPort0, BWDivider], 25, [1, 10]>; +defm : BWWriteResPair<WriteIDiv8, [BWPort0, BWDivider], 25, [1, 10]>; +defm : BWWriteResPair<WriteIDiv16, [BWPort0, BWDivider], 25, [1, 10]>; +defm : BWWriteResPair<WriteIDiv32, [BWPort0, BWDivider], 25, [1, 10]>; +defm : BWWriteResPair<WriteIDiv64, [BWPort0, BWDivider], 25, [1, 10]>; + +defm : X86WriteRes<WriteBSWAP32, [BWPort15], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [BWPort06, BWPort15], 2, [1, 1], 2>; + +defm : BWWriteResPair<WriteCRC32, [BWPort1], 3>; +def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part. + +def : WriteRes<WriteLEA, [BWPort15]>; // LEA instructions can't fold loads. + +defm : BWWriteResPair<WriteCMOV, [BWPort06], 1>; // Conditional move. +defm : BWWriteResPair<WriteCMOV2, [BWPort06,BWPort0156], 2, [1,1], 2>; // // Conditional (CF + ZF flag) move. +defm : X86WriteRes<WriteFCMOV, [BWPort1], 3, [1], 1>; // x87 conditional move. + +def : WriteRes<WriteSETCC, [BWPort06]>; // Setcc. +def : WriteRes<WriteSETCCStore, [BWPort06,BWPort4,BWPort237]> { + let Latency = 2; + let NumMicroOps = 3; +} +def : WriteRes<WriteLAHFSAHF, [BWPort06]>; +def : WriteRes<WriteBitTest,[BWPort06]>; // Bit Test instrs + +// Bit counts. +defm : BWWriteResPair<WriteBSF, [BWPort1], 3>; +defm : BWWriteResPair<WriteBSR, [BWPort1], 3>; +defm : BWWriteResPair<WriteLZCNT, [BWPort1], 3>; +defm : BWWriteResPair<WriteTZCNT, [BWPort1], 3>; +defm : BWWriteResPair<WritePOPCNT, [BWPort1], 3>; + +// Integer shifts and rotates. +defm : BWWriteResPair<WriteShift, [BWPort06], 1>; + +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [BWPort1], 3, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[BWPort1,BWPort06,BWPort0156], 6, [1, 1, 2], 4>; +defm : X86WriteRes<WriteSHDmri, [BWPort1,BWPort23,BWPort237,BWPort0156], 9, [1, 1, 1, 1], 4>; +defm : X86WriteRes<WriteSHDmrcl,[BWPort1,BWPort23,BWPort237,BWPort06,BWPort0156], 11, [1, 1, 1, 1, 2], 6>; + +// BMI1 BEXTR, BMI2 BZHI +defm : BWWriteResPair<WriteBEXTR, [BWPort06,BWPort15], 2, [1,1], 2>; +defm : BWWriteResPair<WriteBZHI, [BWPort15], 1>; + +// Loads, stores, and moves, not folded with other operations. +defm : X86WriteRes<WriteLoad, [BWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteStore, [BWPort237, BWPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteStoreNT, [BWPort237, BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteMove, [BWPort0156], 1, [1], 1>; + +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +def : WriteRes<WriteZero, []>; + +// Treat misc copies as a move. +def : InstRW<[WriteMove], (instrs COPY)>; + +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +defm : BWWriteResPair<WriteJump, [BWPort06], 1>; + +// Floating point. This covers both scalar and vector operations. 
+defm : X86WriteRes<WriteFLD0, [BWPort01], 1, [1], 1>; +defm : X86WriteRes<WriteFLD1, [BWPort01], 1, [2], 2>; +defm : X86WriteRes<WriteFLDC, [BWPort01], 1, [2], 2>; +defm : X86WriteRes<WriteFLoad, [BWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [BWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [BWPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFMaskedLoad, [BWPort23,BWPort5], 7, [1,2], 3>; +defm : X86WriteRes<WriteFMaskedLoadY, [BWPort23,BWPort5], 8, [1,2], 3>; +defm : X86WriteRes<WriteFStore, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreX, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreY, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreNTX, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>; + +defm : BWWriteResPair<WriteFAdd, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub. +defm : BWWriteResPair<WriteFAddX, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub (XMM). +defm : BWWriteResPair<WriteFAddY, [BWPort1], 3, [1], 1, 6>; // Floating point add/sub (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteFAddZ>; +defm : BWWriteResPair<WriteFAdd64, [BWPort1], 3, [1], 1, 5>; // Floating point double add/sub. +defm : BWWriteResPair<WriteFAdd64X, [BWPort1], 3, [1], 1, 5>; // Floating point double add/sub (XMM). +defm : BWWriteResPair<WriteFAdd64Y, [BWPort1], 3, [1], 1, 6>; // Floating point double add/sub (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteFAdd64Z>; + +defm : BWWriteResPair<WriteFCmp, [BWPort1], 3, [1], 1, 5>; // Floating point compare. +defm : BWWriteResPair<WriteFCmpX, [BWPort1], 3, [1], 1, 5>; // Floating point compare (XMM). +defm : BWWriteResPair<WriteFCmpY, [BWPort1], 3, [1], 1, 6>; // Floating point compare (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteFCmpZ>; +defm : BWWriteResPair<WriteFCmp64, [BWPort1], 3, [1], 1, 5>; // Floating point double compare. +defm : BWWriteResPair<WriteFCmp64X, [BWPort1], 3, [1], 1, 5>; // Floating point double compare (XMM). +defm : BWWriteResPair<WriteFCmp64Y, [BWPort1], 3, [1], 1, 6>; // Floating point double compare (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteFCmp64Z>; + +defm : BWWriteResPair<WriteFCom, [BWPort1], 3>; // Floating point compare to flags. + +defm : BWWriteResPair<WriteFMul, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication. +defm : BWWriteResPair<WriteFMulX, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication (XMM). +defm : BWWriteResPair<WriteFMulY, [BWPort01], 3, [1], 1, 6>; // Floating point multiplication (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteFMulZ>; +defm : BWWriteResPair<WriteFMul64, [BWPort01], 3, [1], 1, 5>; // Floating point double multiplication. +defm : BWWriteResPair<WriteFMul64X, [BWPort01], 3, [1], 1, 5>; // Floating point double multiplication (XMM). +defm : BWWriteResPair<WriteFMul64Y, [BWPort01], 3, [1], 1, 6>; // Floating point double multiplication (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteFMul64Z>; + +//defm : BWWriteResPair<WriteFDiv, [BWPort0,BWFPDivider], 11, [1,3], 1, 5>; // Floating point division. 
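+
+// Illustrative note (not part of the original file): BWFPDivider models the
+// non-pipelined divide/sqrt unit. In the ResourceCycles lists below, e.g.
+// [1,5] for the XMM single-precision divide, the operation occupies BWPort0
+// for one cycle and the divider for five, so the 11 cycle figure is latency
+// while throughput is bounded by divider occupancy (about one divide per five
+// cycles on this model).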
+defm : BWWriteResPair<WriteFDivX, [BWPort0,BWFPDivider], 11, [1,5], 1, 5>; // Floating point division (XMM). +defm : BWWriteResPair<WriteFDivY, [BWPort0,BWPort015,BWFPDivider], 17, [2,1,10], 3, 6>; // Floating point division (YMM). +defm : X86WriteResPairUnsupported<WriteFDivZ>; +//defm : BWWriteResPair<WriteFDiv64, [BWPort0,BWFPDivider], 14, [1,8], 1, 5>; // Floating point division. +defm : BWWriteResPair<WriteFDiv64X, [BWPort0,BWFPDivider], 14, [1,8], 1, 5>; // Floating point division (XMM). +defm : BWWriteResPair<WriteFDiv64Y, [BWPort0,BWPort015,BWFPDivider], 23, [2,1,16], 3, 6>; // Floating point division (YMM). +defm : X86WriteResPairUnsupported<WriteFDiv64Z>; + +defm : X86WriteRes<WriteFSqrt, [BWPort0,BWFPDivider], 11, [1,4], 1>; // Floating point square root. +defm : X86WriteRes<WriteFSqrtLd, [BWPort0,BWPort23,BWFPDivider], 16, [1,1,7], 2>; +defm : BWWriteResPair<WriteFSqrtX, [BWPort0,BWFPDivider], 11, [1,7], 1, 5>; // Floating point square root (XMM). +defm : BWWriteResPair<WriteFSqrtY, [BWPort0,BWPort015,BWFPDivider], 21, [2,1,14], 3, 6>; // Floating point square root (YMM). +defm : X86WriteResPairUnsupported<WriteFSqrtZ>; +defm : X86WriteRes<WriteFSqrt64, [BWPort0,BWFPDivider], 16, [1,8], 1>; // Floating point double square root. +defm : X86WriteRes<WriteFSqrt64Ld, [BWPort0,BWPort23,BWFPDivider], 21, [1,1,14], 2>; +defm : BWWriteResPair<WriteFSqrt64X, [BWPort0,BWFPDivider], 16, [1,14],1, 5>; // Floating point double square root (XMM). +defm : BWWriteResPair<WriteFSqrt64Y, [BWPort0,BWPort015,BWFPDivider], 29, [2,1,28], 3, 6>; // Floating point double square root (YMM). +defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; +defm : BWWriteResPair<WriteFSqrt80, [BWPort0,BWFPDivider], 23, [1,9]>; // Floating point long double square root. + +defm : BWWriteResPair<WriteFRcp, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal estimate. +defm : BWWriteResPair<WriteFRcpX, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal estimate (XMM). +defm : BWWriteResPair<WriteFRcpY, [BWPort0,BWPort015], 11, [2,1], 3, 6>; // Floating point reciprocal estimate (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteFRcpZ>; + +defm : BWWriteResPair<WriteFRsqrt, [BWPort0], 5, [1], 1, 5>; // Floating point reciprocal square root estimate. +defm : BWWriteResPair<WriteFRsqrtX,[BWPort0], 5, [1], 1, 5>; // Floating point reciprocal square root estimate (XMM). +defm : BWWriteResPair<WriteFRsqrtY,[BWPort0,BWPort015], 11, [2,1], 3, 6>; // Floating point reciprocal square root estimate (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; + +defm : BWWriteResPair<WriteFMA, [BWPort01], 5, [1], 1, 5>; // Fused Multiply Add. +defm : BWWriteResPair<WriteFMAX, [BWPort01], 5, [1], 1, 5>; // Fused Multiply Add (XMM). +defm : BWWriteResPair<WriteFMAY, [BWPort01], 5, [1], 1, 6>; // Fused Multiply Add (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteFMAZ>; +defm : BWWriteResPair<WriteDPPD, [BWPort0,BWPort1,BWPort5], 9, [1,1,1], 3, 5>; // Floating point double dot product. +defm : BWWriteResPair<WriteDPPS, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 5>; // Floating point single dot product. +defm : BWWriteResPair<WriteDPPSY, [BWPort0,BWPort1,BWPort5], 14, [2,1,1], 4, 6>; // Floating point single dot product (YMM). +defm : X86WriteResPairUnsupported<WriteDPPSZ>; +defm : BWWriteResPair<WriteFSign, [BWPort5], 1>; // Floating point fabs/fchs. +defm : X86WriteRes<WriteFRnd, [BWPort23], 6, [1], 1>; // Floating point rounding. +defm : X86WriteRes<WriteFRndY, [BWPort23], 6, [1], 1>; // Floating point rounding (YMM/ZMM). 
+defm : X86WriteResPairUnsupported<WriteFRndZ>; +defm : X86WriteRes<WriteFRndLd, [BWPort1,BWPort23], 11, [2,1], 3>; +defm : X86WriteRes<WriteFRndYLd, [BWPort1,BWPort23], 12, [2,1], 3>; +defm : BWWriteResPair<WriteFLogic, [BWPort5], 1, [1], 1, 5>; // Floating point and/or/xor logicals. +defm : BWWriteResPair<WriteFLogicY, [BWPort5], 1, [1], 1, 6>; // Floating point and/or/xor logicals (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteFLogicZ>; +defm : BWWriteResPair<WriteFTest, [BWPort0], 1, [1], 1, 5>; // Floating point TEST instructions. +defm : BWWriteResPair<WriteFTestY, [BWPort0], 1, [1], 1, 6>; // Floating point TEST instructions (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteFTestZ>; +defm : BWWriteResPair<WriteFShuffle, [BWPort5], 1, [1], 1, 5>; // Floating point vector shuffles. +defm : BWWriteResPair<WriteFShuffleY, [BWPort5], 1, [1], 1, 6>; // Floating point vector shuffles (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteFShuffleZ>; +defm : BWWriteResPair<WriteFVarShuffle, [BWPort5], 1, [1], 1, 5>; // Floating point vector variable shuffles. +defm : BWWriteResPair<WriteFVarShuffleY, [BWPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles. +defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; +defm : BWWriteResPair<WriteFBlend, [BWPort015], 1, [1], 1, 5>; // Floating point vector blends. +defm : BWWriteResPair<WriteFBlendY, [BWPort015], 1, [1], 1, 6>; // Floating point vector blends. +defm : X86WriteResPairUnsupported<WriteFBlendZ>; +defm : BWWriteResPair<WriteFVarBlend, [BWPort5], 2, [2], 2, 5>; // Fp vector variable blends. +defm : BWWriteResPair<WriteFVarBlendY, [BWPort5], 2, [2], 2, 6>; // Fp vector variable blends. +defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; + +// FMA Scheduling helper class. +// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } + +// Vector integer operations. +defm : X86WriteRes<WriteVecLoad, [BWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [BWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [BWPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadNT, [BWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadNTY, [BWPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecMaskedLoad, [BWPort23,BWPort5], 7, [1,2], 3>; +defm : X86WriteRes<WriteVecMaskedLoadY, [BWPort23,BWPort5], 8, [1,2], 3>; +defm : X86WriteRes<WriteVecStore, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreX, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreY, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteVecMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteVecMove, [BWPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [BWPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [BWPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveToGpr, [BWPort0], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveFromGpr, [BWPort5], 1, [1], 1>; + +defm : X86WriteRes<WriteEMMS, [BWPort01,BWPort15,BWPort015,BWPort0156], 31, [8,1,21,1], 31>; + +defm : BWWriteResPair<WriteVecALU, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals. +defm : BWWriteResPair<WriteVecALUX, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals. 
+defm : BWWriteResPair<WriteVecALUY, [BWPort15], 1, [1], 1, 6>; // Vector integer ALU op, no logicals (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteVecALUZ>; +defm : BWWriteResPair<WriteVecLogic, [BWPort015], 1, [1], 1, 5>; // Vector integer and/or/xor. +defm : BWWriteResPair<WriteVecLogicX,[BWPort015], 1, [1], 1, 5>; // Vector integer and/or/xor. +defm : BWWriteResPair<WriteVecLogicY,[BWPort015], 1, [1], 1, 6>; // Vector integer and/or/xor (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteVecLogicZ>; +defm : BWWriteResPair<WriteVecTest, [BWPort0,BWPort5], 2, [1,1], 2, 5>; // Vector integer TEST instructions. +defm : BWWriteResPair<WriteVecTestY, [BWPort0,BWPort5], 4, [1,1], 2, 6>; // Vector integer TEST instructions (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteVecTestZ>; +defm : BWWriteResPair<WriteVecIMul, [BWPort0], 5, [1], 1, 5>; // Vector integer multiply. +defm : BWWriteResPair<WriteVecIMulX, [BWPort0], 5, [1], 1, 5>; // Vector integer multiply. +defm : BWWriteResPair<WriteVecIMulY, [BWPort0], 5, [1], 1, 6>; // Vector integer multiply. +defm : X86WriteResPairUnsupported<WriteVecIMulZ>; +defm : BWWriteResPair<WritePMULLD, [BWPort0], 10, [2], 2, 5>; // Vector PMULLD. +defm : BWWriteResPair<WritePMULLDY, [BWPort0], 10, [2], 2, 6>; // Vector PMULLD (YMM/ZMM). +defm : X86WriteResPairUnsupported<WritePMULLDZ>; +defm : BWWriteResPair<WriteShuffle, [BWPort5], 1, [1], 1, 5>; // Vector shuffles. +defm : BWWriteResPair<WriteShuffleX, [BWPort5], 1, [1], 1, 5>; // Vector shuffles. +defm : BWWriteResPair<WriteShuffleY, [BWPort5], 1, [1], 1, 6>; // Vector shuffles (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteShuffleZ>; +defm : BWWriteResPair<WriteVarShuffle, [BWPort5], 1, [1], 1, 5>; // Vector variable shuffles. +defm : BWWriteResPair<WriteVarShuffleX,[BWPort5], 1, [1], 1, 5>; // Vector variable shuffles. +defm : BWWriteResPair<WriteVarShuffleY,[BWPort5], 1, [1], 1, 6>; // Vector variable shuffles (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; +defm : BWWriteResPair<WriteBlend, [BWPort5], 1, [1], 1, 5>; // Vector blends. +defm : BWWriteResPair<WriteBlendY, [BWPort5], 1, [1], 1, 6>; // Vector blends (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteBlendZ>; +defm : BWWriteResPair<WriteVarBlend, [BWPort5], 2, [2], 2, 5>; // Vector variable blends. +defm : BWWriteResPair<WriteVarBlendY, [BWPort5], 2, [2], 2, 6>; // Vector variable blends (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteVarBlendZ>; +defm : BWWriteResPair<WriteMPSAD, [BWPort0, BWPort5], 7, [1, 2], 3, 5>; // Vector MPSAD. +defm : BWWriteResPair<WriteMPSADY, [BWPort0, BWPort5], 7, [1, 2], 3, 6>; // Vector MPSAD. +defm : X86WriteResPairUnsupported<WriteMPSADZ>; +defm : BWWriteResPair<WritePSADBW, [BWPort0], 5, [1], 1, 5>; // Vector PSADBW. +defm : BWWriteResPair<WritePSADBWX, [BWPort0], 5, [1], 1, 5>; // Vector PSADBW. +defm : BWWriteResPair<WritePSADBWY, [BWPort0], 5, [1], 1, 6>; // Vector PSADBW (YMM/ZMM). +defm : X86WriteResPairUnsupported<WritePSADBWZ>; +defm : BWWriteResPair<WritePHMINPOS, [BWPort0], 5>; // Vector PHMINPOS. + +// Vector integer shifts. 
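+// Note on the entries below: most pairs use BWWriteResPair, but WriteVecShiftY
+// is spelled out with X86WriteRes plus a separate WriteVecShiftYLd definition,
+// presumably because its load-folded form runs on different ports
+// (BWPort0 + BWPort23) than the register form (BWPort0 + BWPort5), which the
+// pair multiclass cannot express since it only prepends the load port to the
+// same execution-port list.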
+defm : BWWriteResPair<WriteVecShift, [BWPort0], 1, [1], 1, 5>; +defm : BWWriteResPair<WriteVecShiftX, [BWPort0,BWPort5], 2, [1,1], 2, 5>; +defm : X86WriteRes<WriteVecShiftY, [BWPort0,BWPort5], 4, [1,1], 2>; +defm : X86WriteRes<WriteVecShiftYLd, [BWPort0,BWPort23], 7, [1,1], 2>; +defm : X86WriteResPairUnsupported<WriteVecShiftZ>; + +defm : BWWriteResPair<WriteVecShiftImm, [BWPort0], 1, [1], 1, 5>; +defm : BWWriteResPair<WriteVecShiftImmX, [BWPort0], 1, [1], 1, 5>; // Vector integer immediate shifts (XMM). +defm : BWWriteResPair<WriteVecShiftImmY, [BWPort0], 1, [1], 1, 6>; // Vector integer immediate shifts (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; +defm : BWWriteResPair<WriteVarVecShift, [BWPort0, BWPort5], 3, [2,1], 3, 5>; // Variable vector shifts. +defm : BWWriteResPair<WriteVarVecShiftY, [BWPort0, BWPort5], 3, [2,1], 3, 6>; // Variable vector shifts (YMM/ZMM). +defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; + +// Vector insert/extract operations. +def : WriteRes<WriteVecInsert, [BWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVecInsertLd, [BWPort5,BWPort23]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def : WriteRes<WriteVecExtract, [BWPort0,BWPort5]> { + let Latency = 2; + let NumMicroOps = 2; +} +def : WriteRes<WriteVecExtractSt, [BWPort4,BWPort5,BWPort237]> { + let Latency = 2; + let NumMicroOps = 3; +} + +// Conversion between integer and float. +defm : BWWriteResPair<WriteCvtSS2I, [BWPort1], 3>; +defm : BWWriteResPair<WriteCvtPS2I, [BWPort1], 3>; +defm : BWWriteResPair<WriteCvtPS2IY, [BWPort1], 3>; +defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; +defm : BWWriteResPair<WriteCvtSD2I, [BWPort1], 3>; +defm : BWWriteResPair<WriteCvtPD2I, [BWPort1], 3>; +defm : BWWriteResPair<WriteCvtPD2IY, [BWPort1], 3>; +defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; + +defm : BWWriteResPair<WriteCvtI2SS, [BWPort1], 4>; +defm : BWWriteResPair<WriteCvtI2PS, [BWPort1], 4>; +defm : BWWriteResPair<WriteCvtI2PSY, [BWPort1], 4>; +defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; +defm : BWWriteResPair<WriteCvtI2SD, [BWPort1], 4>; +defm : BWWriteResPair<WriteCvtI2PD, [BWPort1], 4>; +defm : BWWriteResPair<WriteCvtI2PDY, [BWPort1], 4>; +defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; + +defm : BWWriteResPair<WriteCvtSS2SD, [BWPort1], 3>; +defm : BWWriteResPair<WriteCvtPS2PD, [BWPort1], 3>; +defm : BWWriteResPair<WriteCvtPS2PDY, [BWPort1], 3>; +defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; +defm : BWWriteResPair<WriteCvtSD2SS, [BWPort1], 3>; +defm : BWWriteResPair<WriteCvtPD2PS, [BWPort1], 3>; +defm : BWWriteResPair<WriteCvtPD2PSY, [BWPort1], 3>; +defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; + +defm : X86WriteRes<WriteCvtPH2PS, [BWPort0,BWPort5], 2, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSY, [BWPort0,BWPort5], 2, [1,1], 2>; +defm : X86WriteResUnsupported<WriteCvtPH2PSZ>; +defm : X86WriteRes<WriteCvtPH2PSLd, [BWPort0,BWPort23], 6, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSYLd, [BWPort0,BWPort23], 6, [1,1], 2>; +defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>; + +defm : X86WriteRes<WriteCvtPS2PH, [BWPort1,BWPort5], 4, [1,1], 2>; +defm : X86WriteRes<WriteCvtPS2PHY, [BWPort1,BWPort5], 6, [1,1], 2>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; +defm : X86WriteRes<WriteCvtPS2PHSt, [BWPort1,BWPort4,BWPort237], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteCvtPS2PHYSt, [BWPort1,BWPort4,BWPort237], 7, [1,1,1], 3>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; + +// Strings instructions. 
+ +// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [BWPort0]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrMLd, [BWPort0, BWPort23]> { + let Latency = 16; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [BWPort0, BWPort5, BWPort015, BWPort0156]> { + let Latency = 19; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def : WriteRes<WritePCmpEStrMLd, [BWPort0, BWPort5, BWPort23, BWPort015, BWPort0156]> { + let Latency = 24; + let NumMicroOps = 10; + let ResourceCycles = [4,3,1,1,1]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [BWPort0]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrILd, [BWPort0, BWPort23]> { + let Latency = 16; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [BWPort0, BWPort5, BWPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [4,3,1]; +} +def : WriteRes<WritePCmpEStrILd, [BWPort0, BWPort5, BWPort23, BWPort0156]> { + let Latency = 23; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} + +// MOVMSK Instructions. +def : WriteRes<WriteFMOVMSK, [BWPort0]> { let Latency = 3; } +def : WriteRes<WriteVecMOVMSK, [BWPort0]> { let Latency = 3; } +def : WriteRes<WriteVecMOVMSKY, [BWPort0]> { let Latency = 3; } +def : WriteRes<WriteMMXMOVMSK, [BWPort0]> { let Latency = 1; } + +// AES instructions. +def : WriteRes<WriteAESDecEnc, [BWPort5]> { // Decryption, encryption. + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESDecEncLd, [BWPort5, BWPort23]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} + +def : WriteRes<WriteAESIMC, [BWPort5]> { // InvMixColumn. + let Latency = 14; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESIMCLd, [BWPort5, BWPort23]> { + let Latency = 19; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} + +def : WriteRes<WriteAESKeyGen, [BWPort0, BWPort5, BWPort015]> { // Key Generation. + let Latency = 29; + let NumMicroOps = 11; + let ResourceCycles = [2,7,2]; +} +def : WriteRes<WriteAESKeyGenLd, [BWPort0, BWPort5, BWPort23, BWPort015]> { + let Latency = 33; + let NumMicroOps = 11; + let ResourceCycles = [2,7,1,1]; +} + +// Carry-less multiplication instructions. +defm : BWWriteResPair<WriteCLMul, [BWPort0], 5>; + +// Catch-all for expensive system instructions. +def : WriteRes<WriteSystem, [BWPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite; + +// AVX2. +defm : BWWriteResPair<WriteFShuffle256, [BWPort5], 3, [1], 1, 6>; // Fp 256-bit width vector shuffles. +defm : BWWriteResPair<WriteFVarShuffle256, [BWPort5], 3, [1], 1, 6>; // Fp 256-bit width vector variable shuffles. +defm : BWWriteResPair<WriteShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector shuffles. +defm : BWWriteResPair<WriteVarShuffle256, [BWPort5], 3, [1], 1, 6>; // 256-bit width vector variable shuffles. + +// Old microcoded instructions that nobody use. +def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite; + +// Fence instructions. +def : WriteRes<WriteFence, [BWPort23, BWPort4]>; + +// Load/store MXCSR. 
+def : WriteRes<WriteLDMXCSR, [BWPort0,BWPort23,BWPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } +def : WriteRes<WriteSTMXCSR, [BWPort4,BWPort5,BWPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } + +// Nop, not very useful expect it provides a model for nops! +def : WriteRes<WriteNop, []>; + +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : BWWriteResPair<WriteFHAdd, [BWPort1,BWPort5], 5, [1,2], 3, 5>; +defm : BWWriteResPair<WriteFHAddY, [BWPort1,BWPort5], 5, [1,2], 3, 6>; +defm : BWWriteResPair<WritePHAdd, [BWPort5,BWPort15], 3, [2,1], 3, 5>; +defm : BWWriteResPair<WritePHAddX, [BWPort5,BWPort15], 3, [2,1], 3, 5>; +defm : BWWriteResPair<WritePHAddY, [BWPort5,BWPort15], 3, [2,1], 3, 6>; + +// Remaining instrs. + +def BWWriteResGroup1 : SchedWriteRes<[BWPort0]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup1], (instregex "VPSLLVQ(Y?)rr", + "VPSRLVQ(Y?)rr")>; + +def BWWriteResGroup2 : SchedWriteRes<[BWPort1]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup2], (instregex "COM(P?)_FST0r", + "UCOM_F(P?)r")>; + +def BWWriteResGroup3 : SchedWriteRes<[BWPort5]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVQ2DQrr")>; + +def BWWriteResGroup4 : SchedWriteRes<[BWPort6]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup4], (instregex "JMP(16|32|64)r")>; + +def BWWriteResGroup5 : SchedWriteRes<[BWPort01]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup5], (instrs FINCSTP, FNOP)>; + +def BWWriteResGroup6 : SchedWriteRes<[BWPort06]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup6], (instrs CDQ, CQO)>; + +def BWWriteResGroup7 : SchedWriteRes<[BWPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup7], (instregex "ANDN(32|64)rr", + "BLSI(32|64)rr", + "BLSMSK(32|64)rr", + "BLSR(32|64)rr")>; + +def BWWriteResGroup8 : SchedWriteRes<[BWPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup8], (instregex "VPBLENDD(Y?)rri")>; + +def BWWriteResGroup9 : SchedWriteRes<[BWPort0156]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup9], (instregex "SGDT64m", + "SIDT64m", + "SMSW16m", + "STRm", + "SYSCALL")>; + +def BWWriteResGroup10 : SchedWriteRes<[BWPort4,BWPort237]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup10], (instregex "FBSTPm", + "ST_FP(32|64|80)m")>; + +def BWWriteResGroup12 : SchedWriteRes<[BWPort01]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[BWWriteResGroup12], (instrs FDECSTP)>; + +def BWWriteResGroup13 : SchedWriteRes<[BWPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[BWWriteResGroup13], (instregex "ROL(8|16|32|64)r1", + "ROL(8|16|32|64)ri", + "ROR(8|16|32|64)r1", + "ROR(8|16|32|64)ri")>; + +def BWWriteResGroup14 : SchedWriteRes<[BWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let 
ResourceCycles = [2]; +} +def: InstRW<[BWWriteResGroup14], (instrs LFENCE, + MFENCE, + WAIT, + XGETBV)>; + +def BWWriteResGroup15 : SchedWriteRes<[BWPort0,BWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup15], (instregex "(V?)CVTPS2PDrr", + "(V?)CVTSS2SDrr")>; + +def BWWriteResGroup16 : SchedWriteRes<[BWPort6,BWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup16], (instregex "CLFLUSH")>; + +def BWWriteResGroup17 : SchedWriteRes<[BWPort01,BWPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup17], (instregex "MMX_MOVDQ2Qrr")>; + +def BWWriteResGroup18 : SchedWriteRes<[BWPort237,BWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup18], (instrs SFENCE)>; + +def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup20], (instrs CWD)>; +def: InstRW<[BWWriteResGroup20], (instrs JCXZ, JECXZ, JRCXZ)>; +def: InstRW<[BWWriteResGroup20], (instregex "ADC8i8", + "ADC8ri", + "SBB8i8", + "SBB8ri", + "SET(A|BE)r")>; + +def BWWriteResGroup22 : SchedWriteRes<[BWPort4,BWPort6,BWPort237]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup22], (instrs FNSTCW16m)>; + +def BWWriteResGroup24 : SchedWriteRes<[BWPort4,BWPort237,BWPort15]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup24], (instregex "MOVBE(16|32|64)mr")>; + +def BWWriteResGroup25 : SchedWriteRes<[BWPort4,BWPort237,BWPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup25], (instrs PUSH16r, PUSH32r, PUSH64r, + STOSB, STOSL, STOSQ, STOSW)>; +def: InstRW<[BWWriteResGroup25], (instregex "PUSH(16|32|64)rmr", + "PUSH64i8")>; + +def BWWriteResGroup27 : SchedWriteRes<[BWPort1]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup27], (instregex "MMX_CVTPI2PSirr", + "PDEP(32|64)rr", + "PEXT(32|64)rr", + "(V?)CVTDQ2PS(Y?)rr")>; + +def BWWriteResGroup27_16 : SchedWriteRes<[BWPort1, BWPort0156]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup27_16], (instrs IMUL16rri, IMUL16rri8)>; + +def BWWriteResGroup28 : SchedWriteRes<[BWPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTBrr", + "VPBROADCASTWrr")>; + +def BWWriteResGroup30 : SchedWriteRes<[BWPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[BWWriteResGroup30], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr, + XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr, + XCHG16ar, XCHG32ar, XCHG64ar)>; + +def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKSSDWirr", + "MMX_PACKSSWBirr", + "MMX_PACKUSWBirr")>; + +def BWWriteResGroup34 : SchedWriteRes<[BWPort6,BWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[BWWriteResGroup34], (instregex "CLD")>; + +def BWWriteResGroup35 : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 3; + let NumMicroOps 
= 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[BWWriteResGroup35], (instregex "RCL(8|16|32|64)r1", + "RCL(8|16|32|64)ri", + "RCR(8|16|32|64)r1", + "RCR(8|16|32|64)ri")>; + +def BWWriteResGroup36 : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup36], (instregex "ROL(8|16|32|64)rCL", + "ROR(8|16|32|64)rCL", + "SAR(8|16|32|64)rCL", + "SHL(8|16|32|64)rCL", + "SHR(8|16|32|64)rCL")>; + +def BWWriteResGroup37 : SchedWriteRes<[BWPort4,BWPort6,BWPort237,BWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup37], (instregex "CALL(16|32|64)r")>; + +def BWWriteResGroup38 : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup38], (instrs CALL64pcrel32)>; +def: InstRW<[BWWriteResGroup38], (instregex "SET(A|BE)m")>; + +def BWWriteResGroup39 : SchedWriteRes<[BWPort0,BWPort1]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup39], (instregex "(V?)CVT(T?)SD2SI64rr", + "(V?)CVT(T?)SD2SIrr", + "(V?)CVT(T?)SS2SI64rr", + "(V?)CVT(T?)SS2SIrr")>; + +def BWWriteResGroup40 : SchedWriteRes<[BWPort0,BWPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup40], (instregex "VCVTPS2PDYrr")>; + +def BWWriteResGroup41 : SchedWriteRes<[BWPort0,BWPort0156]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup41], (instrs FNSTSW16r)>; + +def BWWriteResGroup42 : SchedWriteRes<[BWPort1,BWPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup42], (instrs IMUL64r, MUL64r, MULX64rr)>; +def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPI2PDirr", + "MMX_CVT(T?)PD2PIirr", + "MMX_CVT(T?)PS2PIirr", + "(V?)CVTDQ2PDrr", + "(V?)CVTPD2PSrr", + "(V?)CVTSD2SSrr", + "(V?)CVTSI642SDrr", + "(V?)CVTSI2SDrr", + "(V?)CVTSI2SSrr", + "(V?)CVT(T?)PD2DQrr")>; + +def BWWriteResGroup42_16 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[BWWriteResGroup42_16], (instrs IMUL16r, MUL16r)>; + +def BWWriteResGroup43 : SchedWriteRes<[BWPort0,BWPort4,BWPort237]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup43], (instrs FNSTSWm)>; + +def BWWriteResGroup44 : SchedWriteRes<[BWPort1,BWPort4,BWPort237]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup44], (instregex "IST(T?)_FP(16|32|64)m", + "IST_F(16|32)m")>; + +def BWWriteResGroup45 : SchedWriteRes<[BWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} +def: InstRW<[BWWriteResGroup45], (instrs FNCLEX)>; + +def BWWriteResGroup46 : SchedWriteRes<[BWPort015,BWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[BWWriteResGroup46], (instrs VZEROUPPER)>; + +def BWWriteResGroup47 : SchedWriteRes<[BWPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup47], (instregex "(V?)PCMPGTQ(Y?)rr", + "MUL_(FPrST0|FST0r|FrST0)")>; + +def BWWriteResGroup49 : SchedWriteRes<[BWPort23]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: 
InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm16", + "MOVSX(16|32|64)rm32", + "MOVSX(16|32|64)rm8", + "MOVZX(16|32|64)rm16", + "MOVZX(16|32|64)rm8", + "VBROADCASTSSrm", + "(V?)MOVDDUPrm", + "(V?)MOVSHDUPrm", + "(V?)MOVSLDUPrm", + "VPBROADCASTDrm", + "VPBROADCASTQrm")>; + +def BWWriteResGroup50 : SchedWriteRes<[BWPort1,BWPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[BWWriteResGroup50], (instregex "(V?)CVTSI642SSrr")>; + +def BWWriteResGroup51 : SchedWriteRes<[BWPort1,BWPort6,BWPort06]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup51], (instregex "STR(16|32|64)r")>; + +def BWWriteResGroup52 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup52], (instrs IMUL32r, MUL32r, MULX32rr)>; + +def BWWriteResGroup54 : SchedWriteRes<[BWPort6,BWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: InstRW<[BWWriteResGroup54], (instrs PAUSE)>; + +def BWWriteResGroup55 : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: InstRW<[BWWriteResGroup55], (instrs XSETBV)>; + +def BWWriteResGroup56 : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; +} +def: InstRW<[BWWriteResGroup56], (instregex "CMPXCHG(8|16|32|64)rr")>; + +def BWWriteResGroup57 : SchedWriteRes<[BWPort4,BWPort237,BWPort0156]> { + let Latency = 5; + let NumMicroOps = 6; + let ResourceCycles = [1,1,4]; +} +def: InstRW<[BWWriteResGroup57], (instregex "PUSHF(16|64)")>; + +def BWWriteResGroup58 : SchedWriteRes<[BWPort23]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup58], (instregex "LD_F(32|64|80)m", + "VBROADCASTF128", + "VBROADCASTI128", + "VBROADCASTSDYrm", + "VBROADCASTSSYrm", + "VMOVDDUPYrm", + "VMOVSHDUPYrm", + "VMOVSLDUPYrm", + "VPBROADCASTDYrm", + "VPBROADCASTQYrm")>; + +def BWWriteResGroup59 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup59], (instregex "(V?)CVTPS2PDrm", + "(V?)CVTSS2SDrm", + "VPSLLVQrm", + "VPSRLVQrm")>; + +def BWWriteResGroup60 : SchedWriteRes<[BWPort1,BWPort5]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup60], (instregex "VCVTDQ2PDYrr", + "VCVTPD2PSYrr", + "VCVT(T?)PD2DQYrr")>; + +def BWWriteResGroup62 : SchedWriteRes<[BWPort6,BWPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup62], (instregex "FARJMP64", + "JMP(16|32|64)m")>; + +def BWWriteResGroup63 : SchedWriteRes<[BWPort23,BWPort06]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup63], (instregex "BT(16|32|64)mi8")>; + +def BWWriteResGroup64 : SchedWriteRes<[BWPort23,BWPort15]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup64], (instregex "ANDN(32|64)rm", + "BLSI(32|64)rm", + "BLSMSK(32|64)rm", + "BLSR(32|64)rm", + "MOVBE(16|32|64)rm")>; + +def BWWriteResGroup65 : SchedWriteRes<[BWPort23,BWPort015]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup65], (instregex "VINSERTF128rm", + "VINSERTI128rm", + "VPBLENDDrmi")>; + +def 
BWWriteResGroup66 : SchedWriteRes<[BWPort23,BWPort0156]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup66], (instrs POP16r, POP32r, POP64r)>; +def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)rmr")>; + +def BWWriteResGroup68 : SchedWriteRes<[BWPort1,BWPort6,BWPort06,BWPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup68], (instregex "SLDT(16|32|64)r")>; + +def BWWriteResGroup69 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup69], (instregex "BTC(16|32|64)mi8", + "BTR(16|32|64)mi8", + "BTS(16|32|64)mi8", + "SAR(8|16|32|64)m1", + "SAR(8|16|32|64)mi", + "SHL(8|16|32|64)m1", + "SHL(8|16|32|64)mi", + "SHR(8|16|32|64)m1", + "SHR(8|16|32|64)mi")>; + +def BWWriteResGroup70 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup70], (instregex "POP(16|32|64)rmm", + "PUSH(16|32|64)rmm")>; + +def BWWriteResGroup71 : SchedWriteRes<[BWPort6,BWPort0156]> { + let Latency = 6; + let NumMicroOps = 6; + let ResourceCycles = [1,5]; +} +def: InstRW<[BWWriteResGroup71], (instrs STD)>; + +def BWWriteResGroup73 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup73], (instregex "VPSLLVQYrm", + "VPSRLVQYrm")>; + +def BWWriteResGroup74 : SchedWriteRes<[BWPort1,BWPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup74], (instregex "FCOM(P?)(32|64)m")>; + +def BWWriteResGroup77 : SchedWriteRes<[BWPort23,BWPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup77], (instregex "VPBLENDDYrmi")>; + +def BWWriteResGroup79 : SchedWriteRes<[BWPort5,BWPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKSSDWirm", + "MMX_PACKSSWBirm", + "MMX_PACKUSWBirm")>; + +def BWWriteResGroup80 : SchedWriteRes<[BWPort23,BWPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[BWWriteResGroup80], (instrs LEAVE, LEAVE64, + SCASB, SCASL, SCASQ, SCASW)>; + +def BWWriteResGroup82 : SchedWriteRes<[BWPort0,BWPort01,BWPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup82], (instrs FLDCW16m)>; + +def BWWriteResGroup84 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup84], (instrs LRETQ, RETQ)>; + +def BWWriteResGroup87 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[BWWriteResGroup87], (instregex "ROL(8|16|32|64)m1", + "ROL(8|16|32|64)mi", + "ROR(8|16|32|64)m1", + "ROR(8|16|32|64)mi")>; + +def BWWriteResGroup88 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[BWWriteResGroup88], (instregex "XADD(8|16|32|64)rm")>; + +def BWWriteResGroup89 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = 
[1,1,1,1,1]; +} +def: InstRW<[BWWriteResGroup89], (instregex "CALL(16|32|64)m", + "FARCALL64")>; + +def BWWriteResGroup90 : SchedWriteRes<[BWPort6,BWPort06,BWPort15,BWPort0156]> { + let Latency = 7; + let NumMicroOps = 7; + let ResourceCycles = [2,2,1,2]; +} +def: InstRW<[BWWriteResGroup90], (instrs LOOP)>; + +def BWWriteResGroup91 : SchedWriteRes<[BWPort1,BWPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTPI2PSirm", + "PDEP(32|64)rm", + "PEXT(32|64)rm", + "(V?)CVTDQ2PSrm")>; + +def BWWriteResGroup91_16 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup91_16], (instrs IMUL16rmi, IMUL16rmi8)>; + +def BWWriteResGroup91_16_2 : SchedWriteRes<[BWPort1, BWPort06, BWPort0156, BWPort23]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,1,2,1]; +} +def: InstRW<[BWWriteResGroup91_16_2], (instrs IMUL16m, MUL16m)>; + +def BWWriteResGroup92 : SchedWriteRes<[BWPort5,BWPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBDYrm", + "VPMOVSXBQYrm", + "VPMOVSXBWYrm", + "VPMOVSXDQYrm", + "VPMOVSXWDYrm", + "VPMOVSXWQYrm", + "VPMOVZXWDYrm")>; + +def BWWriteResGroup97 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[BWWriteResGroup97], (instregex "RCL(8|16|32|64)m1", + "RCL(8|16|32|64)mi", + "RCR(8|16|32|64)m1", + "RCR(8|16|32|64)mi")>; + +def BWWriteResGroup98 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,2,1]; +} +def: InstRW<[BWWriteResGroup98], (instregex "ROR(8|16|32|64)mCL")>; + +def BWWriteResGroup99 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> { + let Latency = 8; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,3]; +} +def: InstRW<[BWWriteResGroup99], (instregex "XCHG(8|16|32|64)rm")>; + +def BWWriteResGroup100 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 8; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,2,1]; +} +def : SchedAlias<WriteADCRMW, BWWriteResGroup100>; +def: InstRW<[BWWriteResGroup100], (instregex "CMPXCHG(8|16|32|64)rm", + "ROL(8|16|32|64)mCL", + "SAR(8|16|32|64)mCL", + "SHL(8|16|32|64)mCL", + "SHR(8|16|32|64)mCL")>; + +def BWWriteResGroup101 : SchedWriteRes<[BWPort1,BWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup101], (instregex "(ADD|SUB|SUBR)_F(32|64)m", + "ILD_F(16|32|64)m", + "VCVTPS2DQYrm", + "VCVTTPS2DQYrm")>; + +def BWWriteResGroup105 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup105], (instregex "(V?)CVTSS2SI(64)?rm", + "(V?)CVT(T?)SD2SI64rm", + "(V?)CVT(T?)SD2SIrm", + "VCVTTSS2SI64rm", + "(V?)CVTTSS2SIrm")>; + +def BWWriteResGroup106 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup106], (instregex "VCVTPS2PDYrm")>; + +def BWWriteResGroup107 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup107], (instrs IMUL64m, MUL64m, MULX64rm)>; +def: 
InstRW<[BWWriteResGroup107], (instregex "CVTPD2PSrm", + "CVT(T?)PD2DQrm", + "MMX_CVTPI2PDirm", + "MMX_CVT(T?)PD2PIirm", + "(V?)CVTDQ2PDrm", + "(V?)CVTSD2SSrm")>; + +def BWWriteResGroup108 : SchedWriteRes<[BWPort5,BWPort23,BWPort015]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTB(Y?)rm", + "VPBROADCASTW(Y?)rm")>; + +def BWWriteResGroup112 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[BWWriteResGroup112], (instregex "RDRAND(16|32|64)r")>; + +def BWWriteResGroup113 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[BWWriteResGroup113], (instregex "LAR(16|32|64)rm", + "LSL(16|32|64)rm")>; + +def BWWriteResGroup115 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup115], (instregex "(V?)PCMPGTQrm")>; + +def BWWriteResGroup117 : SchedWriteRes<[BWPort1,BWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup117], (instregex "FICOM(P?)(16|32)m")>; + +def BWWriteResGroup120 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup120], (instregex "CVTTSS2SI64rm")>; + +def BWWriteResGroup121 : SchedWriteRes<[BWPort1,BWPort23,BWPort06,BWPort0156]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup121], (instrs IMUL32m, MUL32m, MULX32rm)>; + +def BWWriteResGroup122_1 : SchedWriteRes<[BWPort0,BWFPDivider]> { + let Latency = 11; + let NumMicroOps = 1; + let ResourceCycles = [1,3]; // Really 2.5 cycle throughput +} +def : SchedAlias<WriteFDiv, BWWriteResGroup122_1>; // TODO - convert to ZnWriteResFpuPair + +def BWWriteResGroup123 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup123], (instregex "MUL_F(32|64)m", + "VPCMPGTQYrm")>; + +def BWWriteResGroup128 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup128], (instregex "VCVTDQ2PDYrm")>; + +def BWWriteResGroup131 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { + let Latency = 11; + let NumMicroOps = 7; + let ResourceCycles = [2,2,3]; +} +def: InstRW<[BWWriteResGroup131], (instregex "RCL(16|32|64)rCL", + "RCR(16|32|64)rCL")>; + +def BWWriteResGroup132 : SchedWriteRes<[BWPort1,BWPort06,BWPort15,BWPort0156]> { + let Latency = 11; + let NumMicroOps = 9; + let ResourceCycles = [1,4,1,3]; +} +def: InstRW<[BWWriteResGroup132], (instregex "RCL8rCL")>; + +def BWWriteResGroup133 : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 11; + let NumMicroOps = 11; + let ResourceCycles = [2,9]; +} +def: InstRW<[BWWriteResGroup133], (instrs LOOPE)>; +def: InstRW<[BWWriteResGroup133], (instrs LOOPNE)>; + +def BWWriteResGroup135 : SchedWriteRes<[BWPort1,BWPort23]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup135], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>; + +def BWWriteResGroup139_1 : SchedWriteRes<[BWPort0,BWFPDivider]> { + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1,4]; +} +def : 
SchedAlias<WriteFDiv64, BWWriteResGroup139_1>; // TODO - convert to ZnWriteResFpuPair + +def BWWriteResGroup141 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI(16|32)m")>; + +def BWWriteResGroup144 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> { + let Latency = 14; + let NumMicroOps = 8; + let ResourceCycles = [2,2,1,3]; +} +def: InstRW<[BWWriteResGroup144], (instregex "LAR(16|32|64)rr")>; + +def BWWriteResGroup145 : SchedWriteRes<[BWPort1,BWPort06,BWPort15,BWPort0156]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [2,3,1,4]; +} +def: InstRW<[BWWriteResGroup145], (instregex "RCR8rCL")>; + +def BWWriteResGroup146 : SchedWriteRes<[BWPort0,BWPort1,BWPort6,BWPort0156]> { + let Latency = 14; + let NumMicroOps = 12; + let ResourceCycles = [2,1,4,5]; +} +def: InstRW<[BWWriteResGroup146], (instrs XCH_F)>; + +def BWWriteResGroup147 : SchedWriteRes<[BWPort0]> { + let Latency = 15; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup147], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>; + +def BWWriteResGroup149 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> { + let Latency = 15; + let NumMicroOps = 10; + let ResourceCycles = [1,1,1,4,1,2]; +} +def: InstRW<[BWWriteResGroup149], (instregex "RCL(8|16|32|64)mCL")>; + +def BWWriteResGroup150 : SchedWriteRes<[BWPort0,BWPort23,BWFPDivider]> { + let Latency = 16; + let NumMicroOps = 2; + let ResourceCycles = [1,1,5]; +} +def : SchedAlias<WriteFDivLd, BWWriteResGroup150>; // TODO - convert to ZnWriteResFpuPair + +def BWWriteResGroup153 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> { + let Latency = 16; + let NumMicroOps = 14; + let ResourceCycles = [1,1,1,4,2,5]; +} +def: InstRW<[BWWriteResGroup153], (instrs CMPXCHG8B)>; + +def BWWriteResGroup154 : SchedWriteRes<[BWPort5]> { + let Latency = 16; + let NumMicroOps = 16; + let ResourceCycles = [16]; +} +def: InstRW<[BWWriteResGroup154], (instrs VZEROALL)>; + +def BWWriteResGroup159 : SchedWriteRes<[BWPort5,BWPort6,BWPort06,BWPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,5]; +} +def: InstRW<[BWWriteResGroup159], (instrs CPUID)>; +def: InstRW<[BWWriteResGroup159], (instrs RDTSC)>; + +def BWWriteResGroup160 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> { + let Latency = 18; + let NumMicroOps = 11; + let ResourceCycles = [2,1,1,3,1,3]; +} +def: InstRW<[BWWriteResGroup160], (instregex "RCR(8|16|32|64)mCL")>; + +def BWWriteResGroup161 : SchedWriteRes<[BWPort0,BWPort23,BWFPDivider]> { + let Latency = 19; + let NumMicroOps = 2; + let ResourceCycles = [1,1,8]; +} +def : SchedAlias<WriteFDiv64Ld, BWWriteResGroup161>; // TODO - convert to ZnWriteResFpuPair + +def BWWriteResGroup165 : SchedWriteRes<[BWPort0]> { + let Latency = 20; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup165], (instregex "DIV_(FPrST0|FST0r|FrST0)")>; + +def BWWriteResGroup167 : SchedWriteRes<[BWPort4,BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 20; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,1,2]; +} +def: InstRW<[BWWriteResGroup167], (instrs INSB, INSL, INSW)>; + +def BWWriteResGroup169 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 21; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup169], (instregex "DIV_F(32|64)m")>; + 
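+// The remaining overrides follow one of two shapes: a SchedWriteRes group
+// bound to specific opcodes through InstRW (by name or regex), or a
+// SchedAlias that redirects a generic SchedWrite (e.g. WriteFDiv64Ld above)
+// to such a group for this model as a whole. A minimal sketch of the first
+// shape, using the hypothetical names BWWriteResGroupExample and
+// "SOMEOP(32|64)rr":
+//
+//   def BWWriteResGroupExample : SchedWriteRes<[BWPort0,BWPort23]> {
+//     let Latency = 7;            // latency of the matched instructions
+//     let NumMicroOps = 2;        // decoded micro-op count
+//     let ResourceCycles = [1,1]; // one cycle on BWPort0, one on BWPort23
+//   }
+//   def: InstRW<[BWWriteResGroupExample], (instregex "SOMEOP(32|64)rr")>;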
+def BWWriteResGroup171 : SchedWriteRes<[BWPort0,BWPort4,BWPort5,BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 21; + let NumMicroOps = 19; + let ResourceCycles = [2,1,4,1,1,4,6]; +} +def: InstRW<[BWWriteResGroup171], (instrs CMPXCHG16B)>; + +def BWWriteResGroup172 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> { + let Latency = 22; + let NumMicroOps = 18; + let ResourceCycles = [1,1,16]; +} +def: InstRW<[BWWriteResGroup172], (instregex "POPF64")>; + +def BWWriteResGroup176 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> { + let Latency = 23; + let NumMicroOps = 19; + let ResourceCycles = [3,1,15]; +} +def: InstRW<[BWWriteResGroup176], (instregex "XRSTOR(64)?")>; + +def BWWriteResGroup177 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> { + let Latency = 24; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup177], (instregex "DIV_FI(16|32)m")>; + +def BWWriteResGroup180 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 26; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup180], (instregex "DIVR_F(32|64)m")>; + +def BWWriteResGroup182 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> { + let Latency = 29; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI(16|32)m")>; + +def BWWriteResGroup183_1 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 22; + let NumMicroOps = 7; + let ResourceCycles = [1,3,2,1]; +} +def: InstRW<[BWWriteResGroup183_1], (instrs VGATHERQPDrm)>; + +def BWWriteResGroup183_2 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 23; + let NumMicroOps = 9; + let ResourceCycles = [1,3,4,1]; +} +def: InstRW<[BWWriteResGroup183_2], (instrs VGATHERQPDYrm)>; + +def BWWriteResGroup183_3 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 24; + let NumMicroOps = 9; + let ResourceCycles = [1,5,2,1]; +} +def: InstRW<[BWWriteResGroup183_3], (instrs VGATHERQPSYrm)>; + +def BWWriteResGroup183_4 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 25; + let NumMicroOps = 7; + let ResourceCycles = [1,3,2,1]; +} +def: InstRW<[BWWriteResGroup183_4], (instrs VGATHERDPDrm, + VGATHERDPSrm)>; + +def BWWriteResGroup183_5 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 26; + let NumMicroOps = 9; + let ResourceCycles = [1,5,2,1]; +} +def: InstRW<[BWWriteResGroup183_5], (instrs VGATHERDPDYrm)>; + +def BWWriteResGroup183_6 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 26; + let NumMicroOps = 14; + let ResourceCycles = [1,4,8,1]; +} +def: InstRW<[BWWriteResGroup183_6], (instrs VGATHERDPSYrm)>; + +def BWWriteResGroup183_7 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 27; + let NumMicroOps = 9; + let ResourceCycles = [1,5,2,1]; +} +def: InstRW<[BWWriteResGroup183_7], (instrs VGATHERQPSrm)>; + +def BWWriteResGroup185 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> { + let Latency = 29; + let NumMicroOps = 27; + let ResourceCycles = [1,5,1,1,19]; +} +def: InstRW<[BWWriteResGroup185], (instrs XSAVE64)>; + +def BWWriteResGroup186 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> { + let Latency = 30; + let NumMicroOps = 28; + let ResourceCycles = [1,6,1,1,19]; +} +def: InstRW<[BWWriteResGroup186], (instrs XSAVE)>; +def: InstRW<[BWWriteResGroup186], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>; + +def BWWriteResGroup190 : 
SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156]> { + let Latency = 34; + let NumMicroOps = 8; + let ResourceCycles = [2,2,2,1,1]; +} +def: InstRW<[BWWriteResGroup190], (instregex "DIV(8|16|32|64)m")>; + +def BWWriteResGroup191 : SchedWriteRes<[BWPort5,BWPort6,BWPort23,BWPort06,BWPort0156]> { + let Latency = 34; + let NumMicroOps = 23; + let ResourceCycles = [1,5,3,4,10]; +} +def: InstRW<[BWWriteResGroup191], (instregex "IN(8|16|32)ri", + "IN(8|16|32)rr")>; + +def BWWriteResGroup193 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156]> { + let Latency = 35; + let NumMicroOps = 8; + let ResourceCycles = [2,2,2,1,1]; +} +def: InstRW<[BWWriteResGroup193], (instregex "IDIV(8|16|32|64)m")>; + +def BWWriteResGroup194 : SchedWriteRes<[BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 35; + let NumMicroOps = 23; + let ResourceCycles = [1,5,2,1,4,10]; +} +def: InstRW<[BWWriteResGroup194], (instregex "OUT(8|16|32)ir", + "OUT(8|16|32)rr")>; + +def BWWriteResGroup196 : SchedWriteRes<[BWPort5,BWPort0156]> { + let Latency = 42; + let NumMicroOps = 22; + let ResourceCycles = [2,20]; +} +def: InstRW<[BWWriteResGroup196], (instrs RDTSCP)>; + +def BWWriteResGroup197 : SchedWriteRes<[BWPort0,BWPort01,BWPort23,BWPort05,BWPort06,BWPort015,BWPort0156]> { + let Latency = 60; + let NumMicroOps = 64; + let ResourceCycles = [2,2,8,1,10,2,39]; +} +def: InstRW<[BWWriteResGroup197], (instrs FLDENVm)>; + +def BWWriteResGroup198 : SchedWriteRes<[BWPort0,BWPort6,BWPort23,BWPort05,BWPort06,BWPort15,BWPort0156]> { + let Latency = 63; + let NumMicroOps = 88; + let ResourceCycles = [4,4,31,1,2,1,45]; +} +def: InstRW<[BWWriteResGroup198], (instrs FXRSTOR64)>; + +def BWWriteResGroup199 : SchedWriteRes<[BWPort0,BWPort6,BWPort23,BWPort05,BWPort06,BWPort15,BWPort0156]> { + let Latency = 63; + let NumMicroOps = 90; + let ResourceCycles = [4,2,33,1,2,1,47]; +} +def: InstRW<[BWWriteResGroup199], (instrs FXRSTOR)>; + +def BWWriteResGroup200 : SchedWriteRes<[BWPort5,BWPort01,BWPort0156]> { + let Latency = 75; + let NumMicroOps = 15; + let ResourceCycles = [6,3,6]; +} +def: InstRW<[BWWriteResGroup200], (instrs FNINIT)>; + +def BWWriteResGroup201 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156]> { + let Latency = 80; + let NumMicroOps = 32; + let ResourceCycles = [7,7,3,3,1,11]; +} +def: InstRW<[BWWriteResGroup201], (instregex "DIV(16|32|64)r")>; + +def BWWriteResGroup202 : SchedWriteRes<[BWPort0,BWPort1,BWPort4,BWPort5,BWPort6,BWPort237,BWPort06,BWPort0156]> { + let Latency = 115; + let NumMicroOps = 100; + let ResourceCycles = [9,9,11,8,1,11,21,30]; +} +def: InstRW<[BWWriteResGroup202], (instrs FSTENVm)>; + +def: InstRW<[WriteZero], (instrs CLC)>; + +} // SchedModel diff --git a/capstone/suite/synctools/tablegen/X86/back/X86SchedHaswell.td b/capstone/suite/synctools/tablegen/X86/back/X86SchedHaswell.td new file mode 100644 index 000000000..876c3e416 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86SchedHaswell.td @@ -0,0 +1,1975 @@ +//=- X86SchedHaswell.td - X86 Haswell Scheduling -------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Haswell to support instruction +// scheduling and other instruction cost heuristics. 
+// +// Note that we define some instructions here that are not supported by haswell, +// but we still have to define them because KNL uses the HSW model. +// They are currently tagged with a comment `Unsupported = 1`. +// FIXME: Use Unsupported = 1 once KNL has its own model. +// +//===----------------------------------------------------------------------===// + +def HaswellModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and HW can decode 4 + // instructions per cycle. + let IssueWidth = 4; + let MicroOpBufferSize = 192; // Based on the reorder buffer. + let LoadLatency = 5; + let MispredictPenalty = 16; + + // Based on the LSD (loop-stream detector) queue size and benchmarking data. + let LoopMicroOpBufferSize = 50; + + // This flag is set to allow the scheduler to assign a default model to + // unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = HaswellModel in { + +// Haswell can issue micro-ops to 8 different ports in one cycle. + +// Ports 0, 1, 5, and 6 handle all computation. +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. Port 7 can handle address calculations. +def HWPort0 : ProcResource<1>; +def HWPort1 : ProcResource<1>; +def HWPort2 : ProcResource<1>; +def HWPort3 : ProcResource<1>; +def HWPort4 : ProcResource<1>; +def HWPort5 : ProcResource<1>; +def HWPort6 : ProcResource<1>; +def HWPort7 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def HWPort01 : ProcResGroup<[HWPort0, HWPort1]>; +def HWPort23 : ProcResGroup<[HWPort2, HWPort3]>; +def HWPort237 : ProcResGroup<[HWPort2, HWPort3, HWPort7]>; +def HWPort04 : ProcResGroup<[HWPort0, HWPort4]>; +def HWPort05 : ProcResGroup<[HWPort0, HWPort5]>; +def HWPort06 : ProcResGroup<[HWPort0, HWPort6]>; +def HWPort15 : ProcResGroup<[HWPort1, HWPort5]>; +def HWPort16 : ProcResGroup<[HWPort1, HWPort6]>; +def HWPort56 : ProcResGroup<[HWPort5, HWPort6]>; +def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>; +def HWPort056 : ProcResGroup<[HWPort0, HWPort5, HWPort6]>; +def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>; + +// 60 Entry Unified Scheduler +def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4, + HWPort5, HWPort6, HWPort7]> { + let BufferSize=60; +} + +// Integer division issued on port 0. +def HWDivider : ProcResource<1>; +// FP division and sqrt on port 0. +def HWFPDivider : ProcResource<1>; + +// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 5>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1, + int LoadLat = 5> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to + // the latency (default = 5). 
+ def : WriteRes<SchedRW.Folded, !listconcat([HWPort23], ExePorts)> { + let Latency = !add(Lat, LoadLat); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = !add(UOps, 1); + } +} + +// A folded store needs a cycle on port 4 for the store data, and an extra port +// 2/3/7 cycle to recompute the address. +def : WriteRes<WriteRMW, [HWPort237,HWPort4]>; + +// Store_addr on 237. +// Store_data on 4. +defm : X86WriteRes<WriteStore, [HWPort237, HWPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteStoreNT, [HWPort237, HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteLoad, [HWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteMove, [HWPort0156], 1, [1], 1>; +def : WriteRes<WriteZero, []>; + +// Arithmetic. +defm : HWWriteResPair<WriteALU, [HWPort0156], 1>; +defm : HWWriteResPair<WriteADC, [HWPort06, HWPort0156], 2, [1,1], 2>; +defm : HWWriteResPair<WriteIMul, [HWPort1], 3>; +defm : HWWriteResPair<WriteIMul64, [HWPort1], 3>; + +defm : X86WriteRes<WriteBSWAP32, [HWPort15], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [HWPort06, HWPort15], 2, [1,1], 2>; + +def : WriteRes<WriteIMulH, []> { let Latency = 3; } + +// Integer shifts and rotates. +defm : HWWriteResPair<WriteShift, [HWPort06], 1>; + +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [HWPort1], 3, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[HWPort1, HWPort06, HWPort0156], 6, [1, 1, 2], 4>; +defm : X86WriteRes<WriteSHDmri, [HWPort1, HWPort23, HWPort237, HWPort0156], 10, [1, 1, 1, 1], 4>; +defm : X86WriteRes<WriteSHDmrcl,[HWPort1, HWPort23, HWPort237, HWPort06, HWPort0156], 12, [1, 1, 1, 1, 2], 6>; + +defm : HWWriteResPair<WriteJump, [HWPort06], 1>; +defm : HWWriteResPair<WriteCRC32, [HWPort1], 3>; + +defm : HWWriteResPair<WriteCMOV, [HWPort06,HWPort0156], 2, [1,1], 2>; // Conditional move. +defm : HWWriteResPair<WriteCMOV2, [HWPort06,HWPort0156], 3, [1,2], 3>; // Conditional (CF + ZF flag) move. +defm : X86WriteRes<WriteFCMOV, [HWPort1], 3, [1], 1>; // x87 conditional move. +def : WriteRes<WriteSETCC, [HWPort06]>; // Setcc. +def : WriteRes<WriteSETCCStore, [HWPort06,HWPort4,HWPort237]> { + let Latency = 2; + let NumMicroOps = 3; +} +def : WriteRes<WriteLAHFSAHF, [HWPort06]>; +def : WriteRes<WriteBitTest,[HWPort06]>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [HWPort15]>; + +// Bit counts. +defm : HWWriteResPair<WriteBSF, [HWPort1], 3>; +defm : HWWriteResPair<WriteBSR, [HWPort1], 3>; +defm : HWWriteResPair<WriteLZCNT, [HWPort1], 3>; +defm : HWWriteResPair<WriteTZCNT, [HWPort1], 3>; +defm : HWWriteResPair<WritePOPCNT, [HWPort1], 3>; + +// BMI1 BEXTR, BMI2 BZHI +defm : HWWriteResPair<WriteBEXTR, [HWPort06,HWPort15], 2, [1,1], 2>; +defm : HWWriteResPair<WriteBZHI, [HWPort15], 1>; + +defm : HWWriteResPair<WriteDiv8, [HWPort0, HWDivider], 25, [1,10], 1, 4>; +defm : HWWriteResPair<WriteDiv16, [HWPort0, HWDivider], 25, [1,10], 1, 4>; +defm : HWWriteResPair<WriteDiv32, [HWPort0, HWDivider], 25, [1,10], 1, 4>; +defm : HWWriteResPair<WriteDiv64, [HWPort0, HWDivider], 25, [1,10], 1, 4>; +defm : HWWriteResPair<WriteIDiv8, [HWPort0, HWDivider], 25, [1,10], 1, 4>; +defm : HWWriteResPair<WriteIDiv16, [HWPort0, HWDivider], 25, [1,10], 1, 4>; +defm : HWWriteResPair<WriteIDiv32, [HWPort0, HWDivider], 25, [1,10], 1, 4>; +defm : HWWriteResPair<WriteIDiv64, [HWPort0, HWDivider], 25, [1,10], 1, 4>; + +// Scalar and vector floating point. 
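+// For reference, each HWWriteResPair defm below expands, per the multiclass
+// defined above, into two WriteRes records: a register form on the listed
+// execution ports, and a folded-load form that prepends HWPort23, adds one
+// micro-op, and adds the trailing LoadLat argument to the latency. E.g. for
+// the WriteFAddX entry below,
+//
+//   defm : HWWriteResPair<WriteFAddX, [HWPort1], 3, [1], 1, 6>;
+//
+// the register form is one micro-op on HWPort1 with latency 3, and the folded
+// form (SchedRW.Folded) is two micro-ops on HWPort23 + HWPort1 with latency 9.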
+defm : X86WriteRes<WriteFLD0, [HWPort01], 1, [1], 1>; +defm : X86WriteRes<WriteFLD1, [HWPort01], 1, [2], 2>; +defm : X86WriteRes<WriteFLDC, [HWPort01], 1, [2], 2>; +defm : X86WriteRes<WriteFLoad, [HWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [HWPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [HWPort23], 7, [1], 1>; +defm : X86WriteRes<WriteFMaskedLoad, [HWPort23,HWPort5], 8, [1,2], 3>; +defm : X86WriteRes<WriteFMaskedLoadY, [HWPort23,HWPort5], 9, [1,2], 3>; +defm : X86WriteRes<WriteFStore, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreX, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreY, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreNTX, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>; +defm : X86WriteRes<WriteEMMS, [HWPort01,HWPort15,HWPort015,HWPort0156], 31, [8,1,21,1], 31>; + +defm : HWWriteResPair<WriteFAdd, [HWPort1], 3, [1], 1, 5>; +defm : HWWriteResPair<WriteFAddX, [HWPort1], 3, [1], 1, 6>; +defm : HWWriteResPair<WriteFAddY, [HWPort1], 3, [1], 1, 7>; +defm : HWWriteResPair<WriteFAddZ, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteFAdd64, [HWPort1], 3, [1], 1, 5>; +defm : HWWriteResPair<WriteFAdd64X, [HWPort1], 3, [1], 1, 6>; +defm : HWWriteResPair<WriteFAdd64Y, [HWPort1], 3, [1], 1, 7>; +defm : HWWriteResPair<WriteFAdd64Z, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1 + +defm : HWWriteResPair<WriteFCmp, [HWPort1], 3, [1], 1, 5>; +defm : HWWriteResPair<WriteFCmpX, [HWPort1], 3, [1], 1, 6>; +defm : HWWriteResPair<WriteFCmpY, [HWPort1], 3, [1], 1, 7>; +defm : HWWriteResPair<WriteFCmpZ, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteFCmp64, [HWPort1], 3, [1], 1, 5>; +defm : HWWriteResPair<WriteFCmp64X, [HWPort1], 3, [1], 1, 6>; +defm : HWWriteResPair<WriteFCmp64Y, [HWPort1], 3, [1], 1, 7>; +defm : HWWriteResPair<WriteFCmp64Z, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1 + +defm : HWWriteResPair<WriteFCom, [HWPort1], 3>; + +defm : HWWriteResPair<WriteFMul, [HWPort01], 5, [1], 1, 5>; +defm : HWWriteResPair<WriteFMulX, [HWPort01], 5, [1], 1, 6>; +defm : HWWriteResPair<WriteFMulY, [HWPort01], 5, [1], 1, 7>; +defm : HWWriteResPair<WriteFMulZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteFMul64, [HWPort01], 5, [1], 1, 5>; +defm : HWWriteResPair<WriteFMul64X, [HWPort01], 5, [1], 1, 6>; +defm : HWWriteResPair<WriteFMul64Y, [HWPort01], 5, [1], 1, 7>; +defm : HWWriteResPair<WriteFMul64Z, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1 + +defm : HWWriteResPair<WriteFDiv, [HWPort0,HWFPDivider], 13, [1,7], 1, 5>; +defm : HWWriteResPair<WriteFDivX, [HWPort0,HWFPDivider], 13, [1,7], 1, 6>; +defm : HWWriteResPair<WriteFDivY, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>; +defm : HWWriteResPair<WriteFDivZ, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteFDiv64, [HWPort0,HWFPDivider], 20, [1,14], 1, 5>; +defm : HWWriteResPair<WriteFDiv64X, [HWPort0,HWFPDivider], 20, [1,14], 1, 6>; +defm : HWWriteResPair<WriteFDiv64Y, 
[HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>; +defm : HWWriteResPair<WriteFDiv64Z, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>; // Unsupported = 1 + +defm : HWWriteResPair<WriteFRcp, [HWPort0], 5, [1], 1, 5>; +defm : HWWriteResPair<WriteFRcpX, [HWPort0], 5, [1], 1, 6>; +defm : HWWriteResPair<WriteFRcpY, [HWPort0,HWPort015], 11, [2,1], 3, 7>; +defm : HWWriteResPair<WriteFRcpZ, [HWPort0,HWPort015], 11, [2,1], 3, 7>; // Unsupported = 1 + +defm : HWWriteResPair<WriteFRsqrt, [HWPort0], 5, [1], 1, 5>; +defm : HWWriteResPair<WriteFRsqrtX,[HWPort0], 5, [1], 1, 6>; +defm : HWWriteResPair<WriteFRsqrtY,[HWPort0,HWPort015], 11, [2,1], 3, 7>; +defm : HWWriteResPair<WriteFRsqrtZ,[HWPort0,HWPort015], 11, [2,1], 3, 7>; // Unsupported = 1 + +defm : HWWriteResPair<WriteFSqrt, [HWPort0,HWFPDivider], 11, [1,7], 1, 5>; +defm : HWWriteResPair<WriteFSqrtX, [HWPort0,HWFPDivider], 11, [1,7], 1, 6>; +defm : HWWriteResPair<WriteFSqrtY, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>; +defm : HWWriteResPair<WriteFSqrtZ, [HWPort0,HWPort15,HWFPDivider], 21, [2,1,14], 3, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteFSqrt64, [HWPort0,HWFPDivider], 16, [1,14], 1, 5>; +defm : HWWriteResPair<WriteFSqrt64X, [HWPort0,HWFPDivider], 16, [1,14], 1, 6>; +defm : HWWriteResPair<WriteFSqrt64Y, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>; +defm : HWWriteResPair<WriteFSqrt64Z, [HWPort0,HWPort15,HWFPDivider], 35, [2,1,28], 3, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteFSqrt80, [HWPort0,HWFPDivider], 23, [1,17]>; + +defm : HWWriteResPair<WriteFMA, [HWPort01], 5, [1], 1, 5>; +defm : HWWriteResPair<WriteFMAX, [HWPort01], 5, [1], 1, 6>; +defm : HWWriteResPair<WriteFMAY, [HWPort01], 5, [1], 1, 7>; +defm : HWWriteResPair<WriteFMAZ, [HWPort01], 5, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteDPPD, [HWPort0,HWPort1,HWPort5], 9, [1,1,1], 3, 6>; +defm : HWWriteResPair<WriteDPPS, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 6>; +defm : HWWriteResPair<WriteDPPSY, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>; +defm : HWWriteResPair<WriteDPPSZ, [HWPort0,HWPort1,HWPort5], 14, [2,1,1], 4, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteFSign, [HWPort0], 1>; +defm : X86WriteRes<WriteFRnd, [HWPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFRndY, [HWPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFRndZ, [HWPort23], 6, [1], 1>; // Unsupported = 1 +defm : X86WriteRes<WriteFRndLd, [HWPort1,HWPort23], 12, [2,1], 3>; +defm : X86WriteRes<WriteFRndYLd, [HWPort1,HWPort23], 13, [2,1], 3>; +defm : X86WriteRes<WriteFRndZLd, [HWPort1,HWPort23], 13, [2,1], 3>; // Unsupported = 1 +defm : HWWriteResPair<WriteFLogic, [HWPort5], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteFLogicY, [HWPort5], 1, [1], 1, 7>; +defm : HWWriteResPair<WriteFLogicZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteFTest, [HWPort0], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteFTestY, [HWPort0], 1, [1], 1, 7>; +defm : HWWriteResPair<WriteFTestZ, [HWPort0], 1, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteFShuffle, [HWPort5], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteFShuffleY, [HWPort5], 1, [1], 1, 7>; +defm : HWWriteResPair<WriteFShuffleZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteFVarShuffle, [HWPort5], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteFVarShuffleY, [HWPort5], 1, [1], 1, 7>; +defm : HWWriteResPair<WriteFVarShuffleZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteFBlend, [HWPort015], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteFBlendY, 
[HWPort015], 1, [1], 1, 7>; +defm : HWWriteResPair<WriteFBlendZ, [HWPort015], 1, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteFShuffle256, [HWPort5], 3, [1], 1, 7>; +defm : HWWriteResPair<WriteFVarShuffle256, [HWPort5], 3, [1], 1, 7>; +defm : HWWriteResPair<WriteFVarBlend, [HWPort5], 2, [2], 2, 6>; +defm : HWWriteResPair<WriteFVarBlendY, [HWPort5], 2, [2], 2, 7>; +defm : HWWriteResPair<WriteFVarBlendZ, [HWPort5], 2, [2], 2, 7>; // Unsupported = 1 + +// Conversion between integer and float. +defm : HWWriteResPair<WriteCvtSD2I, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtPD2I, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtPD2IY, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtPD2IZ, [HWPort1], 3>; // Unsupported = 1 +defm : HWWriteResPair<WriteCvtSS2I, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtPS2I, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtPS2IY, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtPS2IZ, [HWPort1], 3>; // Unsupported = 1 + +defm : HWWriteResPair<WriteCvtI2SD, [HWPort1], 4>; +defm : HWWriteResPair<WriteCvtI2PD, [HWPort1], 4>; +defm : HWWriteResPair<WriteCvtI2PDY, [HWPort1], 4>; +defm : HWWriteResPair<WriteCvtI2PDZ, [HWPort1], 4>; // Unsupported = 1 +defm : HWWriteResPair<WriteCvtI2SS, [HWPort1], 4>; +defm : HWWriteResPair<WriteCvtI2PS, [HWPort1], 4>; +defm : HWWriteResPair<WriteCvtI2PSY, [HWPort1], 4>; +defm : HWWriteResPair<WriteCvtI2PSZ, [HWPort1], 4>; // Unsupported = 1 + +defm : HWWriteResPair<WriteCvtSS2SD, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtPS2PD, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtPS2PDY, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtPS2PDZ, [HWPort1], 3>; // Unsupported = 1 +defm : HWWriteResPair<WriteCvtSD2SS, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtPD2PS, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtPD2PSY, [HWPort1], 3>; +defm : HWWriteResPair<WriteCvtPD2PSZ, [HWPort1], 3>; // Unsupported = 1 + +defm : X86WriteRes<WriteCvtPH2PS, [HWPort0,HWPort5], 2, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSY, [HWPort0,HWPort5], 2, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSZ, [HWPort0,HWPort5], 2, [1,1], 2>; // Unsupported = 1 +defm : X86WriteRes<WriteCvtPH2PSLd, [HWPort0,HWPort23], 6, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSYLd, [HWPort0,HWPort23], 7, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSZLd, [HWPort0,HWPort23], 7, [1,1], 2>; // Unsupported = 1 + +defm : X86WriteRes<WriteCvtPS2PH, [HWPort1,HWPort5], 4, [1,1], 2>; +defm : X86WriteRes<WriteCvtPS2PHY, [HWPort1,HWPort5], 6, [1,1], 2>; +defm : X86WriteRes<WriteCvtPS2PHZ, [HWPort1,HWPort5], 6, [1,1], 2>; // Unsupported = 1 +defm : X86WriteRes<WriteCvtPS2PHSt, [HWPort1,HWPort4,HWPort5,HWPort237], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteCvtPS2PHYSt, [HWPort1,HWPort4,HWPort5,HWPort237], 7, [1,1,1,1], 4>; +defm : X86WriteRes<WriteCvtPS2PHZSt, [HWPort1,HWPort4,HWPort5,HWPort237], 7, [1,1,1,1], 4>; // Unsupported = 1 + +// Vector integer operations. 
+defm : X86WriteRes<WriteVecLoad, [HWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [HWPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [HWPort23], 7, [1], 1>; +defm : X86WriteRes<WriteVecLoadNT, [HWPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadNTY, [HWPort23], 7, [1], 1>; +defm : X86WriteRes<WriteVecMaskedLoad, [HWPort23,HWPort5], 8, [1,2], 3>; +defm : X86WriteRes<WriteVecMaskedLoadY, [HWPort23,HWPort5], 9, [1,2], 3>; +defm : X86WriteRes<WriteVecStore, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreX, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreY, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteVecMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; +defm : X86WriteRes<WriteVecMove, [HWPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [HWPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [HWPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveToGpr, [HWPort0], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveFromGpr, [HWPort5], 1, [1], 1>; + +defm : HWWriteResPair<WriteVecLogic, [HWPort015], 1, [1], 1, 5>; +defm : HWWriteResPair<WriteVecLogicX,[HWPort015], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteVecLogicY,[HWPort015], 1, [1], 1, 7>; +defm : HWWriteResPair<WriteVecLogicZ,[HWPort015], 1, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteVecTest, [HWPort0,HWPort5], 2, [1,1], 2, 6>; +defm : HWWriteResPair<WriteVecTestY, [HWPort0,HWPort5], 4, [1,1], 2, 7>; +defm : HWWriteResPair<WriteVecTestZ, [HWPort0,HWPort5], 4, [1,1], 2, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteVecALU, [HWPort15], 1, [1], 1, 5>; +defm : HWWriteResPair<WriteVecALUX, [HWPort15], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteVecALUY, [HWPort15], 1, [1], 1, 7>; +defm : HWWriteResPair<WriteVecALUZ, [HWPort15], 1, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteVecIMul, [HWPort0], 5, [1], 1, 5>; +defm : HWWriteResPair<WriteVecIMulX, [HWPort0], 5, [1], 1, 6>; +defm : HWWriteResPair<WriteVecIMulY, [HWPort0], 5, [1], 1, 7>; +defm : HWWriteResPair<WriteVecIMulZ, [HWPort0], 5, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WritePMULLD, [HWPort0], 10, [2], 2, 6>; +defm : HWWriteResPair<WritePMULLDY, [HWPort0], 10, [2], 2, 7>; +defm : HWWriteResPair<WritePMULLDZ, [HWPort0], 10, [2], 2, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteShuffle, [HWPort5], 1, [1], 1, 5>; +defm : HWWriteResPair<WriteShuffleX, [HWPort5], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteShuffleY, [HWPort5], 1, [1], 1, 7>; +defm : HWWriteResPair<WriteShuffleZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteVarShuffle, [HWPort5], 1, [1], 1, 5>; +defm : HWWriteResPair<WriteVarShuffleX,[HWPort5], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteVarShuffleY,[HWPort5], 1, [1], 1, 7>; +defm : HWWriteResPair<WriteVarShuffleZ,[HWPort5], 1, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteBlend, [HWPort5], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteBlendY, [HWPort5], 1, [1], 1, 7>; +defm : HWWriteResPair<WriteBlendZ, [HWPort5], 1, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteShuffle256, [HWPort5], 3, [1], 1, 7>; +defm : HWWriteResPair<WriteVarShuffle256, [HWPort5], 3, [1], 1, 7>; +defm : HWWriteResPair<WriteVarBlend, [HWPort5], 2, [2], 2, 6>; +defm : 
HWWriteResPair<WriteVarBlendY, [HWPort5], 2, [2], 2, 7>; +defm : HWWriteResPair<WriteVarBlendZ, [HWPort5], 2, [2], 2, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteMPSAD, [HWPort0, HWPort5], 7, [1, 2], 3, 6>; +defm : HWWriteResPair<WriteMPSADY, [HWPort0, HWPort5], 7, [1, 2], 3, 7>; +defm : HWWriteResPair<WriteMPSADZ, [HWPort0, HWPort5], 7, [1, 2], 3, 7>; // Unsupported = 1 +defm : HWWriteResPair<WritePSADBW, [HWPort0], 5, [1], 1, 5>; +defm : HWWriteResPair<WritePSADBWX, [HWPort0], 5, [1], 1, 6>; +defm : HWWriteResPair<WritePSADBWY, [HWPort0], 5, [1], 1, 7>; +defm : HWWriteResPair<WritePSADBWZ, [HWPort0], 5, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WritePHMINPOS, [HWPort0], 5, [1], 1, 6>; + +// Vector integer shifts. +defm : HWWriteResPair<WriteVecShift, [HWPort0], 1, [1], 1, 5>; +defm : HWWriteResPair<WriteVecShiftX, [HWPort0,HWPort5], 2, [1,1], 2, 6>; +defm : X86WriteRes<WriteVecShiftY, [HWPort0,HWPort5], 4, [1,1], 2>; +defm : X86WriteRes<WriteVecShiftZ, [HWPort0,HWPort5], 4, [1,1], 2>; // Unsupported = 1 +defm : X86WriteRes<WriteVecShiftYLd, [HWPort0,HWPort23], 8, [1,1], 2>; +defm : X86WriteRes<WriteVecShiftZLd, [HWPort0,HWPort23], 8, [1,1], 2>; // Unsupported = 1 + +defm : HWWriteResPair<WriteVecShiftImm, [HWPort0], 1, [1], 1, 5>; +defm : HWWriteResPair<WriteVecShiftImmX, [HWPort0], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteVecShiftImmY, [HWPort0], 1, [1], 1, 7>; +defm : HWWriteResPair<WriteVecShiftImmZ, [HWPort0], 1, [1], 1, 7>; // Unsupported = 1 +defm : HWWriteResPair<WriteVarVecShift, [HWPort0, HWPort5], 3, [2,1], 3, 6>; +defm : HWWriteResPair<WriteVarVecShiftY, [HWPort0, HWPort5], 3, [2,1], 3, 7>; +defm : HWWriteResPair<WriteVarVecShiftZ, [HWPort0, HWPort5], 3, [2,1], 3, 7>; // Unsupported = 1 + +// Vector insert/extract operations. +def : WriteRes<WriteVecInsert, [HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVecInsertLd, [HWPort5,HWPort23]> { + let Latency = 6; + let NumMicroOps = 2; +} +def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>; + +def : WriteRes<WriteVecExtract, [HWPort0,HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; +} +def : WriteRes<WriteVecExtractSt, [HWPort4,HWPort5,HWPort237]> { + let Latency = 2; + let NumMicroOps = 3; +} + +// String instructions. 
+ +// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [HWPort0]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrMLd, [HWPort0, HWPort23]> { + let Latency = 17; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [HWPort0, HWPort5, HWPort015, HWPort0156]> { + let Latency = 19; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def : WriteRes<WritePCmpEStrMLd, [HWPort0, HWPort5, HWPort23, HWPort015, HWPort0156]> { + let Latency = 25; + let NumMicroOps = 10; + let ResourceCycles = [4,3,1,1,1]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [HWPort0]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrILd, [HWPort0, HWPort23]> { + let Latency = 17; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [HWPort0, HWPort5, HWPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [4,3,1]; +} +def : WriteRes<WritePCmpEStrILd, [HWPort0, HWPort5, HWPort23, HWPort0156]> { + let Latency = 24; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} + +// MOVMSK Instructions. +def : WriteRes<WriteFMOVMSK, [HWPort0]> { let Latency = 3; } +def : WriteRes<WriteVecMOVMSK, [HWPort0]> { let Latency = 3; } +def : WriteRes<WriteVecMOVMSKY, [HWPort0]> { let Latency = 3; } +def : WriteRes<WriteMMXMOVMSK, [HWPort0]> { let Latency = 1; } + +// AES Instructions. +def : WriteRes<WriteAESDecEnc, [HWPort5]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESDecEncLd, [HWPort5, HWPort23]> { + let Latency = 13; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} + +def : WriteRes<WriteAESIMC, [HWPort5]> { + let Latency = 14; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESIMCLd, [HWPort5, HWPort23]> { + let Latency = 20; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} + +def : WriteRes<WriteAESKeyGen, [HWPort0,HWPort5,HWPort015]> { + let Latency = 29; + let NumMicroOps = 11; + let ResourceCycles = [2,7,2]; +} +def : WriteRes<WriteAESKeyGenLd, [HWPort0,HWPort5,HWPort23,HWPort015]> { + let Latency = 34; + let NumMicroOps = 11; + let ResourceCycles = [2,7,1,1]; +} + +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [HWPort0, HWPort5]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def : WriteRes<WriteCLMulLd, [HWPort0, HWPort5, HWPort23]> { + let Latency = 17; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} + +// Load/store MXCSR. +def : WriteRes<WriteLDMXCSR, [HWPort0,HWPort23,HWPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } +def : WriteRes<WriteSTMXCSR, [HWPort4,HWPort5,HWPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } + +def : WriteRes<WriteSystem, [HWPort0156]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [HWPort0156]> { let Latency = 100; } +def : WriteRes<WriteFence, [HWPort23, HWPort4]>; +def : WriteRes<WriteNop, []>; + +//================ Exceptions ================// + +//-- Specific Scheduling Models --// + +// Starting with P0. 
+def HWWriteP0 : SchedWriteRes<[HWPort0]>;
+
+def HWWriteP01 : SchedWriteRes<[HWPort01]>;
+
+def HWWrite2P01 : SchedWriteRes<[HWPort01]> {
+  let NumMicroOps = 2;
+}
+def HWWrite3P01 : SchedWriteRes<[HWPort01]> {
+  let NumMicroOps = 3;
+}
+
+def HWWriteP0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+  let NumMicroOps = 2;
+}
+
+def HWWrite2P0156_P23 : SchedWriteRes<[HWPort0156, HWPort23]> {
+  let NumMicroOps = 3;
+  let ResourceCycles = [2, 1];
+}
+
+// Starting with P1.
+def HWWriteP1 : SchedWriteRes<[HWPort1]>;
+
+def HWWrite2P1 : SchedWriteRes<[HWPort1]> {
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+
+// Notation:
+// - r = register.
+// - mm = 64 bit mmx register.
+// - x = 128 bit xmm register.
+// - (x)mm = mmx or xmm register.
+// - y = 256 bit ymm register.
+// - v = any vector register.
+// - m = memory.
+
+//=== Integer Instructions ===//
+//-- Move instructions --//
+
+// XLAT.
+def HWWriteXLAT : SchedWriteRes<[]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+}
+def : InstRW<[HWWriteXLAT], (instrs XLAT)>;
+
+// PUSHA.
+def HWWritePushA : SchedWriteRes<[]> {
+  let NumMicroOps = 19;
+}
+def : InstRW<[HWWritePushA], (instregex "PUSHA(16|32)")>;
+
+// POPA.
+def HWWritePopA : SchedWriteRes<[]> {
+  let NumMicroOps = 18;
+}
+def : InstRW<[HWWritePopA], (instregex "POPA(16|32)")>;
+
+//-- Arithmetic instructions --//
+
+// DIV.
+// r8.
+def HWWriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 22;
+  let NumMicroOps = 9;
+}
+def : InstRW<[HWWriteDiv8], (instregex "DIV8r")>;
+
+// IDIV.
+// r8.
+def HWWriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
+  let Latency = 23;
+  let NumMicroOps = 9;
+}
+def : InstRW<[HWWriteIDiv8], (instregex "IDIV8r")>;
+
+// BT.
+// m,r.
+def HWWriteBTmr : SchedWriteRes<[]> {
+  let NumMicroOps = 10;
+}
+def : InstRW<[HWWriteBTmr], (instregex "BT(16|32|64)mr")>;
+
+// BTR BTS BTC.
+// m,r.
+def HWWriteBTRSCmr : SchedWriteRes<[]> {
+  let NumMicroOps = 11;
+}
+def : InstRW<[HWWriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>;
+
+//-- Control transfer instructions --//
+
+// RET.
+// i.
+def HWWriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
+  let NumMicroOps = 4;
+  let ResourceCycles = [1, 2, 1];
+}
+def : InstRW<[HWWriteRETI], (instregex "RETI(L|Q|W)", "LRETI(L|Q|W)")>;
+
+// BOUND.
+// r,m.
+def HWWriteBOUND : SchedWriteRes<[]> {
+  let NumMicroOps = 15;
+}
+def : InstRW<[HWWriteBOUND], (instregex "BOUNDS(16|32)rm")>;
+
+// INTO.
+def HWWriteINTO : SchedWriteRes<[]> {
+  let NumMicroOps = 4;
+}
+def : InstRW<[HWWriteINTO], (instrs INTO)>;
+
+//-- String instructions --//
+
+// LODSB/W.
+def : InstRW<[HWWrite2P0156_P23], (instregex "LODS(B|W)")>;
+
+// LODSD/Q.
+def : InstRW<[HWWriteP0156_P23], (instregex "LODS(L|Q)")>;
+
+// MOVS.
+def HWWriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 5;
+  let ResourceCycles = [2, 1, 2];
+}
+def : InstRW<[HWWriteMOVS], (instrs MOVSB, MOVSL, MOVSQ, MOVSW)>;
+
+// CMPS.
+def HWWriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
+  let Latency = 4;
+  let NumMicroOps = 5;
+  let ResourceCycles = [2, 3];
+}
+def : InstRW<[HWWriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
+
+//-- Other --//
+
+// RDPMC.
+def HWWriteRDPMC : SchedWriteRes<[]> {
+  let NumMicroOps = 34;
+}
+def : InstRW<[HWWriteRDPMC], (instrs RDPMC)>;
+
+// RDRAND.
+def HWWriteRDRAND : SchedWriteRes<[HWPort23, HWPort015]> {
+  let NumMicroOps = 17;
+  let ResourceCycles = [1, 16];
+}
+def : InstRW<[HWWriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
+
+//=== Floating Point x87 Instructions ===//
+//-- Move instructions --//
+
+// FLD.
+// r.
+def : InstRW<[HWWriteP01], (instregex "LD_Frr")>;
+
+// FBLD.
+// m80.
+def HWWriteFBLD : SchedWriteRes<[]> {
+  let Latency = 47;
+  let NumMicroOps = 43;
+}
+def : InstRW<[HWWriteFBLD], (instregex "FBLDm")>;
+
+// FST(P).
+// r.
+def : InstRW<[HWWriteP01], (instregex "ST_(F|FP)rr")>;
+
+// FFREE.
+def : InstRW<[HWWriteP01], (instregex "FFREE")>;
+
+// FNSAVE.
+def HWWriteFNSAVE : SchedWriteRes<[]> {
+  let NumMicroOps = 147;
+}
+def : InstRW<[HWWriteFNSAVE], (instregex "FSAVEm")>;
+
+// FRSTOR.
+def HWWriteFRSTOR : SchedWriteRes<[]> {
+  let NumMicroOps = 90;
+}
+def : InstRW<[HWWriteFRSTOR], (instregex "FRSTORm")>;
+
+//-- Arithmetic instructions --//
+
+// FCOMPP FUCOMPP.
+// r.
+def : InstRW<[HWWrite2P01], (instrs FCOMPP, UCOM_FPPr)>;
+
+// FCOMI(P) FUCOMI(P).
+// r.
+def : InstRW<[HWWrite3P01], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>;
+
+// FTST.
+def : InstRW<[HWWriteP1], (instregex "TST_F")>;
+
+// FXAM.
+def : InstRW<[HWWrite2P1], (instrs FXAM)>;
+
+// FPREM.
+def HWWriteFPREM : SchedWriteRes<[]> {
+  let Latency = 19;
+  let NumMicroOps = 28;
+}
+def : InstRW<[HWWriteFPREM], (instrs FPREM)>;
+
+// FPREM1.
+def HWWriteFPREM1 : SchedWriteRes<[]> {
+  let Latency = 27;
+  let NumMicroOps = 41;
+}
+def : InstRW<[HWWriteFPREM1], (instrs FPREM1)>;
+
+// FRNDINT.
+def HWWriteFRNDINT : SchedWriteRes<[]> {
+  let Latency = 11;
+  let NumMicroOps = 17;
+}
+def : InstRW<[HWWriteFRNDINT], (instrs FRNDINT)>;
+
+//-- Math instructions --//
+
+// FSCALE.
+def HWWriteFSCALE : SchedWriteRes<[]> {
+  let Latency = 75; // 49-125
+  let NumMicroOps = 50; // 25-75
+}
+def : InstRW<[HWWriteFSCALE], (instrs FSCALE)>;
+
+// FXTRACT.
+def HWWriteFXTRACT : SchedWriteRes<[]> {
+  let Latency = 15;
+  let NumMicroOps = 17;
+}
+def : InstRW<[HWWriteFXTRACT], (instrs FXTRACT)>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : HWWriteResPair<WriteFHAdd, [HWPort1, HWPort5], 5, [1,2], 3, 6>;
+defm : HWWriteResPair<WriteFHAddY, [HWPort1, HWPort5], 5, [1,2], 3, 7>;
+defm : HWWriteResPair<WritePHAdd, [HWPort5, HWPort15], 3, [2,1], 3, 5>;
+defm : HWWriteResPair<WritePHAddX, [HWPort5, HWPort15], 3, [2,1], 3, 6>;
+defm : HWWriteResPair<WritePHAddY, [HWPort5, HWPort15], 3, [2,1], 3, 7>;
+
+//=== Floating Point XMM and YMM Instructions ===//
+
+// Remaining instrs.
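+// The HWWriteResGroup* records below are per-instruction overrides: each
+// SchedWriteRes lists the ports consumed, and InstRW binds it to instructions
+// by name or by regular expression. For example (reading, not redefining, the
+// first group): HWWriteResGroup0 models a single micro-op that occupies
+// HWPort23 for one cycle with an overall latency of 6 cycles, and it is
+// applied to loads such as VBROADCASTSSrm and (V?)MOVSHDUPrm.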
+ +def HWWriteResGroup0 : SchedWriteRes<[HWPort23]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSrm", + "(V?)MOVSHDUPrm", + "(V?)MOVSLDUPrm", + "VPBROADCAST(D|Q)rm")>; + +def HWWriteResGroup0_1 : SchedWriteRes<[HWPort23]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F(32|64|80)m", + "VBROADCASTF128", + "VBROADCASTI128", + "VBROADCASTSDYrm", + "VBROADCASTSSYrm", + "VMOVDDUPYrm", + "VMOVSHDUPYrm", + "VMOVSLDUPYrm", + "VPBROADCAST(D|Q)Yrm")>; + +def HWWriteResGroup0_2 : SchedWriteRes<[HWPort23]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm16", + "MOVSX(16|32|64)rm32", + "MOVSX(16|32|64)rm8", + "MOVZX(16|32|64)rm16", + "MOVZX(16|32|64)rm8", + "(V?)MOVDDUPrm")>; + +def HWWriteResGroup1 : SchedWriteRes<[HWPort4,HWPort237]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup1], (instregex "FBSTPm", + "ST_FP(32|64|80)m", + "VMPTRSTm")>; + +def HWWriteResGroup2 : SchedWriteRes<[HWPort0]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQ(Y?)rr", + "VPSRLVQ(Y?)rr")>; + +def HWWriteResGroup3 : SchedWriteRes<[HWPort1]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup3], (instregex "COM(P?)_FST0r", + "UCOM_F(P?)r")>; + +def HWWriteResGroup4 : SchedWriteRes<[HWPort5]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVQ2DQrr")>; + +def HWWriteResGroup5 : SchedWriteRes<[HWPort6]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup5], (instregex "JMP(16|32|64)r")>; + +def HWWriteResGroup6 : SchedWriteRes<[HWPort01]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup6], (instrs FINCSTP, FNOP)>; + +def HWWriteResGroup7 : SchedWriteRes<[HWPort06]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup7], (instrs CDQ, CQO)>; + +def HWWriteResGroup8 : SchedWriteRes<[HWPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup8], (instregex "ANDN(32|64)rr", + "BLSI(32|64)rr", + "BLSMSK(32|64)rr", + "BLSR(32|64)rr")>; + +def HWWriteResGroup9 : SchedWriteRes<[HWPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDD(Y?)rri")>; + +def HWWriteResGroup10 : SchedWriteRes<[HWPort0156]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup10], (instrs CBW, CWDE, CDQE, + CMC, STC)>; +def: InstRW<[HWWriteResGroup10], (instregex "SGDT64m", + "SIDT64m", + "SMSW16m", + "STRm", + "SYSCALL")>; + +def HWWriteResGroup11 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup11], (instregex "(V?)CVTPS2PDrm")>; + +def HWWriteResGroup11_1 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup11_1], (instregex "(V?)CVTSS2SDrm", + "VPSLLVQrm", + "VPSRLVQrm")>; + +def HWWriteResGroup11_2 : 
SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLVQYrm", + "VPSRLVQYrm")>; + +def HWWriteResGroup12 : SchedWriteRes<[HWPort1,HWPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTPI2PSirm", + "PDEP(32|64)rm", + "PEXT(32|64)rm")>; + +def HWWriteResGroup12_1 : SchedWriteRes<[HWPort1,HWPort0156,HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup12_1], (instrs IMUL16rmi, IMUL16rmi8)>; + +def HWWriteResGroup12_2 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156,HWPort23]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,1,2,1]; +} +def: InstRW<[HWWriteResGroup12_2], (instrs IMUL16m, MUL16m)>; + +def HWWriteResGroup13 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup13], (instregex "(V?)PMOV(SX|ZX)BDrm", + "(V?)PMOV(SX|ZX)BQrm", + "(V?)PMOV(SX|ZX)BWrm", + "(V?)PMOV(SX|ZX)DQrm", + "(V?)PMOV(SX|ZX)WDrm", + "(V?)PMOV(SX|ZX)WQrm")>; + +def HWWriteResGroup13_1 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBDYrm", + "VPMOVSXBQYrm", + "VPMOVSXWQYrm")>; + +def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup14], (instregex "FARJMP64", + "JMP(16|32|64)m")>; + +def HWWriteResGroup15 : SchedWriteRes<[HWPort23,HWPort06]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup15], (instregex "BT(16|32|64)mi8")>; + +def HWWriteResGroup16 : SchedWriteRes<[HWPort23,HWPort15]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup16], (instregex "ANDN(32|64)rm", + "BLSI(32|64)rm", + "BLSMSK(32|64)rm", + "BLSR(32|64)rm", + "MOVBE(16|32|64)rm")>; + +def HWWriteResGroup17 : SchedWriteRes<[HWPort23,HWPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup17], (instregex "VINSERTF128rm", + "VINSERTI128rm", + "VPBLENDDrmi")>; + +def HWWriteResGroup17_2 : SchedWriteRes<[HWPort23,HWPort015]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup17_2], (instregex "VPBLENDDYrmi")>; + +def HWWriteResGroup18 : SchedWriteRes<[HWPort23,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup18], (instrs POP16r, POP32r, POP64r)>; +def: InstRW<[HWWriteResGroup18], (instregex "POP(16|32|64)rmr")>; + +def HWWriteResGroup19 : SchedWriteRes<[HWPort237,HWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup19], (instrs SFENCE)>; + +def HWWriteResGroup21 : SchedWriteRes<[HWPort4,HWPort6,HWPort237]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup21], (instrs FNSTCW16m)>; + +def HWWriteResGroup23 : SchedWriteRes<[HWPort4,HWPort237,HWPort15]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup23], (instregex "MOVBE(32|64)mr")>; + +def HWWriteResGroup23_16 : SchedWriteRes<[HWPort06, 
HWPort237, HWPort4]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup23_16], (instrs MOVBE16mr)>; + +def HWWriteResGroup24 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup24], (instrs PUSH16r, PUSH32r, PUSH64r, + STOSB, STOSL, STOSQ, STOSW)>; +def: InstRW<[HWWriteResGroup24], (instregex "PUSH(16|32|64)rmr", + "PUSH64i8")>; + +def HWWriteResGroup25 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup25], (instregex "BTC(16|32|64)mi8", + "BTR(16|32|64)mi8", + "BTS(16|32|64)mi8", + "SAR(8|16|32|64)m1", + "SAR(8|16|32|64)mi", + "SHL(8|16|32|64)m1", + "SHL(8|16|32|64)mi", + "SHR(8|16|32|64)m1", + "SHR(8|16|32|64)mi")>; + +def HWWriteResGroup26 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup26], (instregex "POP(16|32|64)rmm", + "PUSH(16|32|64)rmm")>; + +def HWWriteResGroup28 : SchedWriteRes<[HWPort01]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[HWWriteResGroup28], (instrs FDECSTP)>; + +def HWWriteResGroup29 : SchedWriteRes<[HWPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[HWWriteResGroup29], (instregex "ROL(8|16|32|64)r1", + "ROL(8|16|32|64)ri", + "ROR(8|16|32|64)r1", + "ROR(8|16|32|64)ri")>; + +def HWWriteResGroup30 : SchedWriteRes<[HWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[HWWriteResGroup30], (instrs LFENCE, + MFENCE, + WAIT, + XGETBV)>; + +def HWWriteResGroup31 : SchedWriteRes<[HWPort0,HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup31], (instregex "(V?)CVTPS2PDrr", + "(V?)CVTSS2SDrr")>; + +def HWWriteResGroup32 : SchedWriteRes<[HWPort6,HWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup32], (instregex "CLFLUSH")>; + +def HWWriteResGroup33 : SchedWriteRes<[HWPort01,HWPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup33], (instregex "MMX_MOVDQ2Qrr")>; + +def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup35], (instrs CWD, JCXZ, JECXZ, JRCXZ)>; +def: InstRW<[HWWriteResGroup35], (instregex "SET(A|BE)r")>; + +def HWWriteResGroup36_2 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKSSDWirm", + "MMX_PACKSSWBirm", + "MMX_PACKUSWBirm")>; + +def HWWriteResGroup37 : SchedWriteRes<[HWPort23,HWPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[HWWriteResGroup37], (instrs LEAVE, LEAVE64, + SCASB, SCASL, SCASQ, SCASW)>; + +def HWWriteResGroup39 : SchedWriteRes<[HWPort0,HWPort01,HWPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup39], (instrs FLDCW16m)>; + +def HWWriteResGroup41 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: 
InstRW<[HWWriteResGroup41], (instrs LRETQ, RETL, RETQ)>; + +def HWWriteResGroup44 : SchedWriteRes<[HWPort4,HWPort6,HWPort237,HWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup44], (instregex "CALL(16|32|64)r")>; + +def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup45], (instrs CALL64pcrel32)>; +def: InstRW<[HWWriteResGroup45], (instregex "SET(A|BE)m")>; + +def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[HWWriteResGroup46], (instregex "ROL(8|16|32|64)m1", + "ROL(8|16|32|64)mi", + "ROR(8|16|32|64)m1", + "ROR(8|16|32|64)mi")>; + +def HWWriteResGroup47 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[HWWriteResGroup47], (instregex "XADD(8|16|32|64)rm")>; + +def HWWriteResGroup48 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,1,1]; +} +def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m", + "FARCALL64")>; + +def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPI2PSirr", + "PDEP(32|64)rr", + "PEXT(32|64)rr", + "(V?)CVTDQ2PS(Y?)rr")>; + +def HWWriteResGroup50_16i : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup50_16i], (instrs IMUL16rri, IMUL16rri8)>; + +def HWWriteResGroup51 : SchedWriteRes<[HWPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCAST(B|W)rr")>; + +def HWWriteResGroup52 : SchedWriteRes<[HWPort1,HWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup52], (instregex "(V?)CVTPS2DQrm", + "(V?)CVTTPS2DQrm")>; + +def HWWriteResGroup52_1 : SchedWriteRes<[HWPort1,HWPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup52_1], (instregex "(ADD|SUB|SUBR)_F(32|64)m", + "ILD_F(16|32|64)m", + "VCVTDQ2PSYrm", + "VCVTPS2DQYrm", + "VCVTTPS2DQYrm")>; + +def HWWriteResGroup53_1 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXBWYrm", + "VPMOVSXDQYrm", + "VPMOVSXWDYrm", + "VPMOVZXWDYrm")>; + +def HWWriteResGroup54 : SchedWriteRes<[HWPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[HWWriteResGroup54], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr, + XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr, + XCHG16ar, XCHG32ar, XCHG64ar)>; + +def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSDWirr", + "MMX_PACKSSWBirr", + "MMX_PACKUSWBirr")>; + +def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[HWWriteResGroup58], (instregex "CLD")>; + +def HWWriteResGroup59 : 
SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[HWWriteResGroup59], (instregex "RCL(8|16|32|64)r1", + "RCL(8|16|32|64)ri", + "RCR(8|16|32|64)r1", + "RCR(8|16|32|64)ri")>; + +def HWWriteResGroup60 : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup60], (instregex "ROL(8|16|32|64)rCL", + "ROR(8|16|32|64)rCL", + "SAR(8|16|32|64)rCL", + "SHL(8|16|32|64)rCL", + "SHR(8|16|32|64)rCL")>; + +def HWWriteResGroup61 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup61], (instrs FNSTSWm)>; + +def HWWriteResGroup62 : SchedWriteRes<[HWPort1,HWPort4,HWPort237]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup62], (instregex "IST(T?)_FP(16|32|64)m", + "IST_F(16|32)m")>; + +def HWWriteResGroup66 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[HWWriteResGroup66], (instregex "RCL(8|16|32|64)m1", + "RCL(8|16|32|64)mi", + "RCR(8|16|32|64)m1", + "RCR(8|16|32|64)mi")>; + +def HWWriteResGroup67 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,1,2,1]; +} +def: InstRW<[HWWriteResGroup67], (instregex "ROR(8|16|32|64)mCL")>; + +def HWWriteResGroup68 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,3]; +} +def: InstRW<[HWWriteResGroup68], (instregex "XCHG(8|16|32|64)rm")>; + +def HWWriteResGroup69 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,2,1]; +} +def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG(8|16|32|64)rm", + "ROL(8|16|32|64)mCL", + "SAR(8|16|32|64)mCL", + "SHL(8|16|32|64)mCL", + "SHR(8|16|32|64)mCL")>; +def: SchedAlias<WriteADCRMW, HWWriteResGroup69>; + +def HWWriteResGroup70 : SchedWriteRes<[HWPort0,HWPort1]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup70], (instregex "(V?)CVT(T?)SD2SI(64)?rr", + "(V?)CVT(T?)SS2SI(64)?rr")>; + +def HWWriteResGroup71 : SchedWriteRes<[HWPort0,HWPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup71], (instregex "VCVTPS2PDYrr")>; + +def HWWriteResGroup72 : SchedWriteRes<[HWPort0,HWPort0156]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup72], (instrs FNSTSW16r)>; + +def HWWriteResGroup73 : SchedWriteRes<[HWPort1,HWPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPI2PDirr", + "MMX_CVT(T?)PD2PIirr", + "MMX_CVT(T?)PS2PIirr", + "(V?)CVTDQ2PDrr", + "(V?)CVTPD2PSrr", + "(V?)CVTSD2SSrr", + "(V?)CVTSI(64)?2SDrr", + "(V?)CVTSI2SSrr", + "(V?)CVT(T?)PD2DQrr")>; + +def HWWriteResGroup74 : SchedWriteRes<[HWPort1,HWPort6]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup74], (instrs IMUL64r, MUL64r, MULX64rr)>; + +def HWWriteResGroup74_16 : SchedWriteRes<[HWPort1, HWPort06, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: 
InstRW<[HWWriteResGroup74_16], (instrs IMUL16r, MUL16r)>; + +def HWWriteResGroup75 : SchedWriteRes<[HWPort1,HWPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup75], (instregex "FICOM(P?)(16|32)m")>; + +def HWWriteResGroup76 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup76], (instregex "(V?)CVTSD2SI(64)?rm", + "(V?)CVTSS2SI(64)?rm", + "(V?)CVTTSD2SI(64)?rm", + "VCVTTSS2SI64rm", + "(V?)CVTTSS2SIrm")>; + +def HWWriteResGroup77 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup77], (instregex "VCVTPS2PDYrm")>; + +def HWWriteResGroup78 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2PSrm", + "CVT(T?)PD2DQrm", + "MMX_CVT(T?)PD2PIirm", + "(V?)CVTDQ2PDrm")>; + +def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup78_1], (instregex "MMX_CVTPI2PDirm", + "(V?)CVTSD2SSrm")>; + +def HWWriteResGroup79 : SchedWriteRes<[HWPort1,HWPort6,HWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup79], (instrs IMUL64m, MUL64m, MULX64rm)>; + +def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCAST(B|W)(Y?)rm")>; + +def HWWriteResGroup81 : SchedWriteRes<[HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} +def: InstRW<[HWWriteResGroup81], (instrs FNCLEX)>; + +def HWWriteResGroup82 : SchedWriteRes<[HWPort015,HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[HWWriteResGroup82], (instrs VZEROUPPER)>; + +def HWWriteResGroup83 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[HWWriteResGroup83], (instregex "LAR(16|32|64)rr")>; + +def HWWriteResGroup87 : SchedWriteRes<[HWPort1,HWPort6,HWPort23,HWPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[HWWriteResGroup87], (instregex "LAR(16|32|64)rm", + "LSL(16|32|64)rm")>; + +def HWWriteResGroup88 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 6; + let ResourceCycles = [1,1,4]; +} +def: InstRW<[HWWriteResGroup88], (instregex "PUSHF(16|64)")>; + +def HWWriteResGroup89 : SchedWriteRes<[HWPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup89], (instregex "(V?)PCMPGTQ(Y?)rr", + "MUL_(FPrST0|FST0r|FrST0)")>; + +def HWWriteResGroup91_2 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup91_2], (instregex "(V?)PCMPGTQrm")>; + +def HWWriteResGroup91_3 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F(32|64)m", + "VPCMPGTQYrm")>; + +def HWWriteResGroup93 : SchedWriteRes<[HWPort1,HWPort5]> { + let Latency = 5; + let 
NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[HWWriteResGroup93], (instregex "(V?)CVTSI642SSrr")>; + +def HWWriteResGroup94 : SchedWriteRes<[HWPort1,HWPort6,HWPort06]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup94], (instregex "STR(16|32|64)r")>; + +def HWWriteResGroup95 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup95], (instrs IMUL32r, MUL32r, MULX32rr)>; + +def HWWriteResGroup97 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup97], (instregex "CVTTSS2SI64rm")>; + +def HWWriteResGroup98 : SchedWriteRes<[HWPort1,HWPort23,HWPort06,HWPort0156]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup98], (instrs IMUL32m, MUL32m, MULX32rm)>; + +def HWWriteResGroup99 : SchedWriteRes<[HWPort6,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: InstRW<[HWWriteResGroup99], (instrs PAUSE)>; + +def HWWriteResGroup100 : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: InstRW<[HWWriteResGroup100], (instrs XSETBV)>; + +def HWWriteResGroup101 : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; +} +def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG(8|16|32|64)rr")>; + +def HWWriteResGroup102 : SchedWriteRes<[HWPort1,HWPort5]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup102], (instregex "VCVTDQ2PDYrr", + "VCVTPD2PSYrr", + "VCVT(T?)PD2DQYrr")>; + +def HWWriteResGroup103 : SchedWriteRes<[HWPort1,HWPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup103], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>; + +def HWWriteResGroup104 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup104], (instregex "VCVTDQ2PDYrm")>; + +def HWWriteResGroup107 : SchedWriteRes<[HWPort1,HWPort6,HWPort06,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup107], (instregex "SLDT(16|32|64)r")>; + +def HWWriteResGroup108 : SchedWriteRes<[HWPort6,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 6; + let ResourceCycles = [1,5]; +} +def: InstRW<[HWWriteResGroup108], (instrs STD)>; + +def HWWriteResGroup114 : SchedWriteRes<[HWPort6,HWPort06,HWPort15,HWPort0156]> { + let Latency = 7; + let NumMicroOps = 7; + let ResourceCycles = [2,2,1,2]; +} +def: InstRW<[HWWriteResGroup114], (instrs LOOP)>; + +def HWWriteResGroup115 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> { + let Latency = 15; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI(16|32)m")>; + +def HWWriteResGroup120 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> { + let Latency = 16; + let NumMicroOps = 10; + let ResourceCycles = [1,1,1,4,1,2]; +} +def: InstRW<[HWWriteResGroup120], (instregex "RCL(8|16|32|64)mCL")>; + +def HWWriteResGroup129 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { + let Latency = 11; + let NumMicroOps = 7; + let ResourceCycles = [2,2,3]; +} +def: 
InstRW<[HWWriteResGroup129], (instregex "RCL(16|32|64)rCL", + "RCR(16|32|64)rCL")>; + +def HWWriteResGroup130 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> { + let Latency = 11; + let NumMicroOps = 9; + let ResourceCycles = [1,4,1,3]; +} +def: InstRW<[HWWriteResGroup130], (instregex "RCL8rCL")>; + +def HWWriteResGroup131 : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 11; + let NumMicroOps = 11; + let ResourceCycles = [2,9]; +} +def: InstRW<[HWWriteResGroup131], (instrs LOOPE, LOOPNE)>; + +def HWWriteResGroup132 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> { + let Latency = 17; + let NumMicroOps = 14; + let ResourceCycles = [1,1,1,4,2,5]; +} +def: InstRW<[HWWriteResGroup132], (instrs CMPXCHG8B)>; + +def HWWriteResGroup135 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> { + let Latency = 19; + let NumMicroOps = 11; + let ResourceCycles = [2,1,1,3,1,3]; +} +def: InstRW<[HWWriteResGroup135], (instregex "RCR(8|16|32|64)mCL")>; + +def HWWriteResGroup142 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [2,3,1,4]; +} +def: InstRW<[HWWriteResGroup142], (instregex "RCR8rCL")>; + +def HWWriteResGroup143 : SchedWriteRes<[HWPort23,HWPort0156]> { + let Latency = 19; + let NumMicroOps = 15; + let ResourceCycles = [1,14]; +} +def: InstRW<[HWWriteResGroup143], (instregex "POPF16")>; + +def HWWriteResGroup144 : SchedWriteRes<[HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 21; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,1,2]; +} +def: InstRW<[HWWriteResGroup144], (instrs INSB, INSL, INSW)>; + +def HWWriteResGroup145 : SchedWriteRes<[HWPort5]> { + let Latency = 16; + let NumMicroOps = 16; + let ResourceCycles = [16]; +} +def: InstRW<[HWWriteResGroup145], (instrs VZEROALL)>; + +def HWWriteResGroup146 : SchedWriteRes<[HWPort0,HWPort4,HWPort5,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 22; + let NumMicroOps = 19; + let ResourceCycles = [2,1,4,1,1,4,6]; +} +def: InstRW<[HWWriteResGroup146], (instrs CMPXCHG16B)>; + +def HWWriteResGroup147 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> { + let Latency = 17; + let NumMicroOps = 15; + let ResourceCycles = [2,1,2,4,2,4]; +} +def: InstRW<[HWWriteResGroup147], (instrs XCH_F)>; + +def HWWriteResGroup149 : SchedWriteRes<[HWPort5,HWPort6,HWPort06,HWPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,5]; +} +def: InstRW<[HWWriteResGroup149], (instrs CPUID, RDTSC)>; + +def HWWriteResGroup151 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> { + let Latency = 23; + let NumMicroOps = 19; + let ResourceCycles = [3,1,15]; +} +def: InstRW<[HWWriteResGroup151], (instregex "XRSTOR(64)?")>; + +def HWWriteResGroup154 : SchedWriteRes<[HWPort0]> { + let Latency = 20; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup154], (instregex "DIV_(FPrST0|FST0r|FrST0)")>; + +def HWWriteResGroup155 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 27; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F(32|64)m")>; + +def HWWriteResGroup156 : SchedWriteRes<[HWPort5,HWPort6,HWPort0156]> { + let Latency = 20; + let NumMicroOps = 10; + let ResourceCycles = [1,2,7]; +} +def: InstRW<[HWWriteResGroup156], (instrs MWAITrr)>; + +def HWWriteResGroup161 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> { + let Latency = 30; + let NumMicroOps = 3; + let 
ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI(16|32)m")>; + +def HWWriteResGroup162 : SchedWriteRes<[HWPort0]> { + let Latency = 24; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup162], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>; + +def HWWriteResGroup163 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 31; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup163], (instregex "DIV_F(32|64)m")>; + +def HWWriteResGroup164 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> { + let Latency = 30; + let NumMicroOps = 27; + let ResourceCycles = [1,5,1,1,19]; +} +def: InstRW<[HWWriteResGroup164], (instrs XSAVE64)>; + +def HWWriteResGroup165 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> { + let Latency = 31; + let NumMicroOps = 28; + let ResourceCycles = [1,6,1,1,19]; +} +def: InstRW<[HWWriteResGroup165], (instrs XSAVE)>; +def: InstRW<[HWWriteResGroup165], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>; + +def HWWriteResGroup166 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> { + let Latency = 34; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI(16|32)m")>; + +def HWWriteResGroup170 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort06,HWPort0156]> { + let Latency = 35; + let NumMicroOps = 23; + let ResourceCycles = [1,5,3,4,10]; +} +def: InstRW<[HWWriteResGroup170], (instregex "IN(8|16|32)ri", + "IN(8|16|32)rr")>; + +def HWWriteResGroup171 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 36; + let NumMicroOps = 23; + let ResourceCycles = [1,5,2,1,4,10]; +} +def: InstRW<[HWWriteResGroup171], (instregex "OUT(8|16|32)ir", + "OUT(8|16|32)rr")>; + +def HWWriteResGroup175 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort15,HWPort0156]> { + let Latency = 41; + let NumMicroOps = 18; + let ResourceCycles = [1,1,2,3,1,1,1,8]; +} +def: InstRW<[HWWriteResGroup175], (instrs VMCLEARm)>; + +def HWWriteResGroup176 : SchedWriteRes<[HWPort5,HWPort0156]> { + let Latency = 42; + let NumMicroOps = 22; + let ResourceCycles = [2,20]; +} +def: InstRW<[HWWriteResGroup176], (instrs RDTSCP)>; + +def HWWriteResGroup177 : SchedWriteRes<[HWPort0,HWPort01,HWPort23,HWPort05,HWPort06,HWPort015,HWPort0156]> { + let Latency = 61; + let NumMicroOps = 64; + let ResourceCycles = [2,2,8,1,10,2,39]; +} +def: InstRW<[HWWriteResGroup177], (instrs FLDENVm)>; + +def HWWriteResGroup178 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> { + let Latency = 64; + let NumMicroOps = 88; + let ResourceCycles = [4,4,31,1,2,1,45]; +} +def: InstRW<[HWWriteResGroup178], (instrs FXRSTOR64)>; + +def HWWriteResGroup179 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> { + let Latency = 64; + let NumMicroOps = 90; + let ResourceCycles = [4,2,33,1,2,1,47]; +} +def: InstRW<[HWWriteResGroup179], (instrs FXRSTOR)>; + +def HWWriteResGroup180 : SchedWriteRes<[HWPort5,HWPort01,HWPort0156]> { + let Latency = 75; + let NumMicroOps = 15; + let ResourceCycles = [6,3,6]; +} +def: InstRW<[HWWriteResGroup180], (instrs FNINIT)>; + +def HWWriteResGroup181 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> { + let Latency = 98; + let NumMicroOps = 32; + let ResourceCycles = [7,7,3,3,1,11]; +} +def: InstRW<[HWWriteResGroup181], (instregex "DIV(16|32|64)r")>; + +def HWWriteResGroup182 : 
SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort06,HWPort0156]> { + let Latency = 112; + let NumMicroOps = 66; + let ResourceCycles = [4,2,4,8,14,34]; +} +def: InstRW<[HWWriteResGroup182], (instregex "IDIV(16|32|64)r")>; + +def HWWriteResGroup183 : SchedWriteRes<[HWPort0,HWPort1,HWPort4,HWPort5,HWPort6,HWPort237,HWPort06,HWPort0156]> { + let Latency = 115; + let NumMicroOps = 100; + let ResourceCycles = [9,9,11,8,1,11,21,30]; +} +def: InstRW<[HWWriteResGroup183], (instrs FSTENVm)>; + +def HWWriteResGroup184 : SchedWriteRes<[HWPort0, HWPort5, HWPort15, HWPort015, HWPort06, HWPort23]> { + let Latency = 26; + let NumMicroOps = 12; + let ResourceCycles = [2,2,1,3,2,2]; +} +def: InstRW<[HWWriteResGroup184], (instrs VGATHERDPDrm, + VPGATHERDQrm, + VPGATHERDDrm)>; + +def HWWriteResGroup185 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 24; + let NumMicroOps = 22; + let ResourceCycles = [5,3,4,1,5,4]; +} +def: InstRW<[HWWriteResGroup185], (instrs VGATHERQPDYrm, + VPGATHERQQYrm)>; + +def HWWriteResGroup186 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 28; + let NumMicroOps = 22; + let ResourceCycles = [5,3,4,1,5,4]; +} +def: InstRW<[HWWriteResGroup186], (instrs VPGATHERQDYrm)>; + +def HWWriteResGroup187 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 25; + let NumMicroOps = 22; + let ResourceCycles = [5,3,4,1,5,4]; +} +def: InstRW<[HWWriteResGroup187], (instrs VPGATHERQDrm)>; + +def HWWriteResGroup188 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 27; + let NumMicroOps = 20; + let ResourceCycles = [3,3,4,1,5,4]; +} +def: InstRW<[HWWriteResGroup188], (instrs VGATHERDPDYrm, + VPGATHERDQYrm)>; + +def HWWriteResGroup189 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 27; + let NumMicroOps = 34; + let ResourceCycles = [5,3,8,1,9,8]; +} +def: InstRW<[HWWriteResGroup189], (instrs VGATHERDPSYrm, + VPGATHERDDYrm)>; + +def HWWriteResGroup190 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 23; + let NumMicroOps = 14; + let ResourceCycles = [3,3,2,1,3,2]; +} +def: InstRW<[HWWriteResGroup190], (instrs VGATHERQPDrm, + VPGATHERQQrm)>; + +def HWWriteResGroup191 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 28; + let NumMicroOps = 15; + let ResourceCycles = [3,3,2,1,4,2]; +} +def: InstRW<[HWWriteResGroup191], (instrs VGATHERQPSYrm)>; + +def HWWriteResGroup192 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 25; + let NumMicroOps = 15; + let ResourceCycles = [3,3,2,1,4,2]; +} +def: InstRW<[HWWriteResGroup192], (instrs VGATHERQPSrm, + VGATHERDPSrm)>; + +def: InstRW<[WriteZero], (instrs CLC)>; + +} // SchedModel diff --git a/capstone/suite/synctools/tablegen/X86/back/X86SchedPredicates.td b/capstone/suite/synctools/tablegen/X86/back/X86SchedPredicates.td new file mode 100644 index 000000000..27aaeb193 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86SchedPredicates.td @@ -0,0 +1,49 @@ +//===-- X86SchedPredicates.td - X86 Scheduling Predicates --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines scheduling predicate definitions that are common to +// all X86 subtargets. +// +//===----------------------------------------------------------------------===// + +// A predicate used to identify dependency-breaking instructions that clear the +// content of the destination register. Note that this predicate only checks if +// input registers are the same. This predicate doesn't make any assumptions on +// the expected instruction opcodes, because different processors may implement +// different zero-idioms. +def ZeroIdiomPredicate : CheckSameRegOperand<1, 2>; + +// A predicate used to check if an instruction is a LEA, and if it uses all +// three source operands: base, index, and offset. +def IsThreeOperandsLEAPredicate: CheckAll<[ + CheckOpcode<[LEA32r, LEA64r, LEA64_32r, LEA16r]>, + + // isRegOperand(Base) + CheckIsRegOperand<1>, + CheckNot<CheckInvalidRegOperand<1>>, + + // isRegOperand(Index) + CheckIsRegOperand<3>, + CheckNot<CheckInvalidRegOperand<3>>, + + // hasLEAOffset(Offset) + CheckAny<[ + CheckAll<[ + CheckIsImmOperand<4>, + CheckNot<CheckZeroOperand<4>> + ]>, + CheckNonPortable<"MI.getOperand(4).isGlobal()"> + ]> +]>; + +// This predicate evaluates to true only if the input machine instruction is a +// 3-operands LEA. Tablegen automatically generates a new method for it in +// X86GenInstrInfo. +def IsThreeOperandsLEAFn : + TIIPredicate<"X86", "isThreeOperandsLEA", IsThreeOperandsLEAPredicate>; diff --git a/capstone/suite/synctools/tablegen/X86/back/X86SchedSandyBridge.td b/capstone/suite/synctools/tablegen/X86/back/X86SchedSandyBridge.td new file mode 100644 index 000000000..6b7bbdea8 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86SchedSandyBridge.td @@ -0,0 +1,1159 @@ +//=- X86SchedSandyBridge.td - X86 Sandy Bridge Scheduling ----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Sandy Bridge to support instruction +// scheduling and other instruction cost heuristics. +// +// Note that we define some instructions here that are not supported by SNB, +// but we still have to define them because SNB is the default subtarget for +// X86. These instructions are tagged with a comment `Unsupported = 1`. +// +//===----------------------------------------------------------------------===// + +def SandyBridgeModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SB can decode 4 + // instructions per cycle. + // FIXME: Identify instructions that aren't a single fused micro-op. + let IssueWidth = 4; + let MicroOpBufferSize = 168; // Based on the reorder buffer. + let LoadLatency = 5; + let MispredictPenalty = 16; + + // Based on the LSD (loop-stream detector) queue size. + let LoopMicroOpBufferSize = 28; + + // This flag is set to allow the scheduler to assign + // a default model to unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = SandyBridgeModel in { + +// Sandy Bridge can issue micro-ops to 6 different ports in one cycle. + +// Ports 0, 1, and 5 handle all computation. +def SBPort0 : ProcResource<1>; +def SBPort1 : ProcResource<1>; +def SBPort5 : ProcResource<1>; + +// Ports 2 and 3 are identical. 
They handle loads and the address half of +// stores. +def SBPort23 : ProcResource<2>; + +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +def SBPort4 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def SBPort01 : ProcResGroup<[SBPort0, SBPort1]>; +def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>; +def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>; +def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>; + +// 54 Entry Unified Scheduler +def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> { + let BufferSize=54; +} + +// Integer division issued on port 0. +def SBDivider : ProcResource<1>; +// FP division and sqrt on port 0. +def SBFPDivider : ProcResource<1>; + +// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 5>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1, + int LoadLat = 5> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to + // the latency (default = 5). + def : WriteRes<SchedRW.Folded, !listconcat([SBPort23], ExePorts)> { + let Latency = !add(Lat, LoadLat); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = !add(UOps, 1); + } +} + +// A folded store needs a cycle on port 4 for the store data, and an extra port +// 2/3 cycle to recompute the address. +def : WriteRes<WriteRMW, [SBPort23,SBPort4]>; + +def : WriteRes<WriteStore, [SBPort23, SBPort4]>; +def : WriteRes<WriteStoreNT, [SBPort23, SBPort4]>; +def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 5; } +def : WriteRes<WriteMove, [SBPort015]>; +def : WriteRes<WriteZero, []>; + +// Arithmetic. +defm : SBWriteResPair<WriteALU, [SBPort015], 1>; +defm : SBWriteResPair<WriteADC, [SBPort05,SBPort015], 2, [1,1], 2>; +defm : SBWriteResPair<WriteIMul, [SBPort1], 3>; +defm : SBWriteResPair<WriteIMul64, [SBPort1], 3>; + +defm : X86WriteRes<WriteBSWAP32, [SBPort1], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [SBPort1,SBPort05], 2, [1,1], 2>; + +defm : SBWriteResPair<WriteDiv8, [SBPort0, SBDivider], 25, [1, 10]>; +defm : SBWriteResPair<WriteDiv16, [SBPort0, SBDivider], 25, [1, 10]>; +defm : SBWriteResPair<WriteDiv32, [SBPort0, SBDivider], 25, [1, 10]>; +defm : SBWriteResPair<WriteDiv64, [SBPort0, SBDivider], 25, [1, 10]>; +defm : SBWriteResPair<WriteIDiv8, [SBPort0, SBDivider], 25, [1, 10]>; +defm : SBWriteResPair<WriteIDiv16, [SBPort0, SBDivider], 25, [1, 10]>; +defm : SBWriteResPair<WriteIDiv32, [SBPort0, SBDivider], 25, [1, 10]>; +defm : SBWriteResPair<WriteIDiv64, [SBPort0, SBDivider], 25, [1, 10]>; + +def : WriteRes<WriteIMulH, []> { let Latency = 3; } + +// SHLD/SHRD. 
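+// Illustrative sketch: a single pair definition such as
+//   defm : SBWriteResPair<WriteALU, [SBPort015], 1>;
+// above should expand to roughly the two records below, assuming the folded
+// form of WriteALU resolves to WriteALULd (per the X86FoldableSchedWrite
+// pairing in X86Schedule.td) and the default LoadLat of 5:
+//
+//   def : WriteRes<WriteALU, [SBPort015]> {
+//     let Latency = 1;
+//     let ResourceCycles = [1];
+//     let NumMicroOps = 1;
+//   }
+//   def : WriteRes<WriteALULd, [SBPort23, SBPort015]> {
+//     let Latency = 6;              // 1 + LoadLat (5)
+//     let ResourceCycles = [1, 1];  // extra port 2/3 cycle for the load
+//     let NumMicroOps = 2;          // micro-fused load + ALU micro-op
+//   }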
+defm : X86WriteRes<WriteSHDrri, [SBPort05, SBPort015], 2, [1, 1], 2>; +defm : X86WriteRes<WriteSHDrrcl,[SBPort05, SBPort015], 4, [3, 1], 4>; +defm : X86WriteRes<WriteSHDmri, [SBPort4,SBPort23,SBPort05,SBPort015], 8, [1, 2, 1, 1], 5>; +defm : X86WriteRes<WriteSHDmrcl,[SBPort4,SBPort23,SBPort05,SBPort015], 10, [1, 2, 3, 1], 7>; + +defm : SBWriteResPair<WriteShift, [SBPort05], 1>; +defm : SBWriteResPair<WriteJump, [SBPort5], 1>; +defm : SBWriteResPair<WriteCRC32, [SBPort1], 3, [1], 1, 5>; + +defm : SBWriteResPair<WriteCMOV, [SBPort05,SBPort015], 2, [1,1], 2>; // Conditional move. +defm : SBWriteResPair<WriteCMOV2, [SBPort05,SBPort015], 3, [2,1], 3>; // Conditional (CF + ZF flag) move. +defm : X86WriteRes<WriteFCMOV, [SBPort5,SBPort05], 3, [2,1], 3>; // x87 conditional move. +def : WriteRes<WriteSETCC, [SBPort05]>; // Setcc. +def : WriteRes<WriteSETCCStore, [SBPort05,SBPort4,SBPort23]> { + let Latency = 2; + let NumMicroOps = 3; +} +def : WriteRes<WriteLAHFSAHF, [SBPort05]>; +def : WriteRes<WriteBitTest,[SBPort05]>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [SBPort01]>; + +// Bit counts. +defm : SBWriteResPair<WriteBSF, [SBPort1], 3, [1], 1, 5>; +defm : SBWriteResPair<WriteBSR, [SBPort1], 3, [1], 1, 5>; +defm : SBWriteResPair<WriteLZCNT, [SBPort1], 3, [1], 1, 5>; +defm : SBWriteResPair<WriteTZCNT, [SBPort1], 3, [1], 1, 5>; +defm : SBWriteResPair<WritePOPCNT, [SBPort1], 3, [1], 1, 6>; + +// BMI1 BEXTR, BMI2 BZHI +// NOTE: These don't exist on Sandy Bridge. Ports are guesses. +defm : SBWriteResPair<WriteBEXTR, [SBPort05,SBPort1], 2, [1,1], 2>; +defm : SBWriteResPair<WriteBZHI, [SBPort1], 1>; + +// Scalar and vector floating point. 
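+// A concrete example for the simple-vs-complex LEA note above (sketch):
+//   leaq (%rax,%rbx), %rcx      // base + index only: "simple", WriteLEA
+//   leaq 8(%rax,%rbx), %rcx     // base + index + displacement: port 1 only,
+//                               // not modeled here; compare
+//                               // IsThreeOperandsLEAPredicate in
+//                               // X86SchedPredicates.td.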
+defm : X86WriteRes<WriteFLD0, [SBPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFLD1, [SBPort0,SBPort5], 1, [1,1], 2>; +defm : X86WriteRes<WriteFLDC, [SBPort0,SBPort1], 1, [1,1], 2>; +defm : X86WriteRes<WriteFLoad, [SBPort23], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [SBPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [SBPort23], 7, [1], 1>; +defm : X86WriteRes<WriteFMaskedLoad, [SBPort23,SBPort05], 8, [1,2], 3>; +defm : X86WriteRes<WriteFMaskedLoadY, [SBPort23,SBPort05], 9, [1,2], 3>; +defm : X86WriteRes<WriteFStore, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteFStoreX, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteFStoreY, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteFStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteFStoreNTX, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteFStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteFMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteFMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>; +defm : X86WriteRes<WriteEMMS, [SBPort015], 31, [31], 31>; + +defm : SBWriteResPair<WriteFAdd, [SBPort1], 3, [1], 1, 6>; +defm : SBWriteResPair<WriteFAddX, [SBPort1], 3, [1], 1, 6>; +defm : SBWriteResPair<WriteFAddY, [SBPort1], 3, [1], 1, 7>; +defm : SBWriteResPair<WriteFAddZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFAdd64, [SBPort1], 3, [1], 1, 6>; +defm : SBWriteResPair<WriteFAdd64X, [SBPort1], 3, [1], 1, 6>; +defm : SBWriteResPair<WriteFAdd64Y, [SBPort1], 3, [1], 1, 7>; +defm : SBWriteResPair<WriteFAdd64Z, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1 + +defm : SBWriteResPair<WriteFCmp, [SBPort1], 3, [1], 1, 6>; +defm : SBWriteResPair<WriteFCmpX, [SBPort1], 3, [1], 1, 6>; +defm : SBWriteResPair<WriteFCmpY, [SBPort1], 3, [1], 1, 7>; +defm : SBWriteResPair<WriteFCmpZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFCmp64, [SBPort1], 3, [1], 1, 6>; +defm : SBWriteResPair<WriteFCmp64X, [SBPort1], 3, [1], 1, 6>; +defm : SBWriteResPair<WriteFCmp64Y, [SBPort1], 3, [1], 1, 7>; +defm : SBWriteResPair<WriteFCmp64Z, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1 + +defm : SBWriteResPair<WriteFCom, [SBPort1], 3>; + +defm : SBWriteResPair<WriteFMul, [SBPort0], 5, [1], 1, 6>; +defm : SBWriteResPair<WriteFMulX, [SBPort0], 5, [1], 1, 6>; +defm : SBWriteResPair<WriteFMulY, [SBPort0], 5, [1], 1, 7>; +defm : SBWriteResPair<WriteFMulZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFMul64, [SBPort0], 5, [1], 1, 6>; +defm : SBWriteResPair<WriteFMul64X, [SBPort0], 5, [1], 1, 6>; +defm : SBWriteResPair<WriteFMul64Y, [SBPort0], 5, [1], 1, 7>; +defm : SBWriteResPair<WriteFMul64Z, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1 + +defm : SBWriteResPair<WriteFDiv, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>; +defm : SBWriteResPair<WriteFDivX, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>; +defm : SBWriteResPair<WriteFDivY, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>; +defm : SBWriteResPair<WriteFDivZ, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFDiv64, [SBPort0,SBFPDivider], 22, [1,22], 1, 6>; +defm : SBWriteResPair<WriteFDiv64X, [SBPort0,SBFPDivider], 22, [1,22], 1, 6>; +defm : SBWriteResPair<WriteFDiv64Y, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 
7>; +defm : SBWriteResPair<WriteFDiv64Z, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>; // Unsupported = 1 + +defm : SBWriteResPair<WriteFRcp, [SBPort0], 5, [1], 1, 6>; +defm : SBWriteResPair<WriteFRcpX, [SBPort0], 5, [1], 1, 6>; +defm : SBWriteResPair<WriteFRcpY, [SBPort0,SBPort05], 7, [2,1], 3, 7>; +defm : SBWriteResPair<WriteFRcpZ, [SBPort0,SBPort05], 7, [2,1], 3, 7>; // Unsupported = 1 + +defm : SBWriteResPair<WriteFRsqrt, [SBPort0], 5, [1], 1, 6>; +defm : SBWriteResPair<WriteFRsqrtX,[SBPort0], 5, [1], 1, 6>; +defm : SBWriteResPair<WriteFRsqrtY,[SBPort0,SBPort05], 7, [2,1], 3, 7>; +defm : SBWriteResPair<WriteFRsqrtZ,[SBPort0,SBPort05], 7, [2,1], 3, 7>; // Unsupported = 1 + +defm : SBWriteResPair<WriteFSqrt, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>; +defm : SBWriteResPair<WriteFSqrtX, [SBPort0,SBFPDivider], 14, [1,14], 1, 6>; +defm : SBWriteResPair<WriteFSqrtY, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>; +defm : SBWriteResPair<WriteFSqrtZ, [SBPort0,SBPort05,SBFPDivider], 29, [2,1,28], 3, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFSqrt64, [SBPort0,SBFPDivider], 21, [1,21], 1, 6>; +defm : SBWriteResPair<WriteFSqrt64X, [SBPort0,SBFPDivider], 21, [1,21], 1, 6>; +defm : SBWriteResPair<WriteFSqrt64Y, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>; +defm : SBWriteResPair<WriteFSqrt64Z, [SBPort0,SBPort05,SBFPDivider], 45, [2,1,44], 3, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFSqrt80, [SBPort0,SBFPDivider], 24, [1,24], 1, 6>; + +defm : SBWriteResPair<WriteDPPD, [SBPort0,SBPort1,SBPort5], 9, [1,1,1], 3, 6>; +defm : SBWriteResPair<WriteDPPS, [SBPort0,SBPort1,SBPort5], 12, [1,2,1], 4, 6>; +defm : SBWriteResPair<WriteDPPSY, [SBPort0,SBPort1,SBPort5], 12, [1,2,1], 4, 7>; +defm : SBWriteResPair<WriteDPPSZ, [SBPort0,SBPort1,SBPort5], 12, [1,2,1], 4, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFSign, [SBPort5], 1>; +defm : SBWriteResPair<WriteFRnd, [SBPort1], 3, [1], 1, 6>; +defm : SBWriteResPair<WriteFRndY, [SBPort1], 3, [1], 1, 7>; +defm : SBWriteResPair<WriteFRndZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFLogic, [SBPort5], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteFLogicY, [SBPort5], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteFLogicZ, [SBPort5], 1, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFTest, [SBPort0], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteFTestY, [SBPort0], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteFTestZ, [SBPort0], 1, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFShuffle, [SBPort5], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteFShuffleY,[SBPort5], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteFShuffleZ,[SBPort5], 1, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFVarShuffle, [SBPort5], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteFVarShuffleY,[SBPort5], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteFVarShuffleZ,[SBPort5], 1, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFBlend, [SBPort05], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteFBlendY, [SBPort05], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteFBlendZ, [SBPort05], 1, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteFVarBlend, [SBPort05], 2, [2], 2, 6>; +defm : SBWriteResPair<WriteFVarBlendY,[SBPort05], 2, [2], 2, 7>; +defm : SBWriteResPair<WriteFVarBlendZ,[SBPort05], 2, [2], 2, 7>; // Unsupported = 1 + +// Conversion between integer and float. 
+defm : SBWriteResPair<WriteCvtSS2I, [SBPort0,SBPort1], 5, [1,1], 2>; +defm : SBWriteResPair<WriteCvtPS2I, [SBPort1], 3, [1], 1, 6>; +defm : SBWriteResPair<WriteCvtPS2IY, [SBPort1], 3, [1], 1, 7>; +defm : SBWriteResPair<WriteCvtPS2IZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteCvtSD2I, [SBPort0,SBPort1], 5, [1,1], 2>; +defm : SBWriteResPair<WriteCvtPD2I, [SBPort1,SBPort5], 4, [1,1], 2, 6>; +defm : X86WriteRes<WriteCvtPD2IY, [SBPort1,SBPort5], 4, [1,1], 2>; +defm : X86WriteRes<WriteCvtPD2IZ, [SBPort1,SBPort5], 4, [1,1], 2>; // Unsupported = 1 +defm : X86WriteRes<WriteCvtPD2IYLd, [SBPort1,SBPort5,SBPort23], 11, [1,1,1], 3>; +defm : X86WriteRes<WriteCvtPD2IZLd, [SBPort1,SBPort5,SBPort23], 11, [1,1,1], 3>; // Unsupported = 1 + +defm : X86WriteRes<WriteCvtI2SS, [SBPort1,SBPort5], 5, [1,2], 3>; +defm : X86WriteRes<WriteCvtI2SSLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>; +defm : SBWriteResPair<WriteCvtI2PS, [SBPort1], 3, [1], 1, 6>; +defm : SBWriteResPair<WriteCvtI2PSY, [SBPort1], 3, [1], 1, 7>; +defm : SBWriteResPair<WriteCvtI2PSZ, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1 +defm : X86WriteRes<WriteCvtI2SD, [SBPort1,SBPort5], 4, [1,1], 2>; +defm : X86WriteRes<WriteCvtI2PD, [SBPort1,SBPort5], 4, [1,1], 2>; +defm : X86WriteRes<WriteCvtI2PDY, [SBPort1,SBPort5], 4, [1,1], 2>; +defm : X86WriteRes<WriteCvtI2PDZ, [SBPort1,SBPort5], 4, [1,1], 2>; // Unsupported = 1 +defm : X86WriteRes<WriteCvtI2SDLd, [SBPort1,SBPort23], 9, [1,1], 2>; +defm : X86WriteRes<WriteCvtI2PDLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>; +defm : X86WriteRes<WriteCvtI2PDYLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>; +defm : X86WriteRes<WriteCvtI2PDZLd, [SBPort1,SBPort5,SBPort23], 10, [1,1,1], 3>; // Unsupported = 1 + +defm : SBWriteResPair<WriteCvtSS2SD, [SBPort0], 1, [1], 1, 6>; +defm : X86WriteRes<WriteCvtPS2PD, [SBPort0,SBPort5], 2, [1,1], 2>; +defm : X86WriteRes<WriteCvtPS2PDY, [SBPort0,SBPort5], 2, [1,1], 2>; +defm : X86WriteRes<WriteCvtPS2PDZ, [SBPort0,SBPort5], 2, [1,1], 2>; // Unsupported = 1 +defm : X86WriteRes<WriteCvtPS2PDLd, [SBPort0,SBPort23], 7, [1,1], 2>; +defm : X86WriteRes<WriteCvtPS2PDYLd, [SBPort0,SBPort23], 7, [1,1], 2>; +defm : X86WriteRes<WriteCvtPS2PDZLd, [SBPort0,SBPort23], 7, [1,1], 2>; // Unsupported = 1 +defm : SBWriteResPair<WriteCvtSD2SS, [SBPort1,SBPort5], 4, [1,1], 2, 6>; +defm : SBWriteResPair<WriteCvtPD2PS, [SBPort1,SBPort5], 4, [1,1], 2, 6>; +defm : SBWriteResPair<WriteCvtPD2PSY, [SBPort1,SBPort5], 4, [1,1], 2, 7>; +defm : SBWriteResPair<WriteCvtPD2PSZ, [SBPort1,SBPort5], 4, [1,1], 2, 7>; // Unsupported = 1 + +defm : SBWriteResPair<WriteCvtPH2PS, [SBPort1], 3>; +defm : SBWriteResPair<WriteCvtPH2PSY, [SBPort1], 3>; +defm : SBWriteResPair<WriteCvtPH2PSZ, [SBPort1], 3>; // Unsupported = 1 + +defm : X86WriteRes<WriteCvtPS2PH, [SBPort1], 3, [1], 1>; +defm : X86WriteRes<WriteCvtPS2PHY, [SBPort1], 3, [1], 1>; +defm : X86WriteRes<WriteCvtPS2PHZ, [SBPort1], 3, [1], 1>; // Unsupported = 1 +defm : X86WriteRes<WriteCvtPS2PHSt, [SBPort1, SBPort23, SBPort4], 4, [1,1,1], 1>; +defm : X86WriteRes<WriteCvtPS2PHYSt, [SBPort1, SBPort23, SBPort4], 4, [1,1,1], 1>; +defm : X86WriteRes<WriteCvtPS2PHZSt, [SBPort1, SBPort23, SBPort4], 4, [1,1,1], 1>; // Unsupported = 1 + +// Vector integer operations. 
+defm : X86WriteRes<WriteVecLoad, [SBPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [SBPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [SBPort23], 7, [1], 1>; +defm : X86WriteRes<WriteVecLoadNT, [SBPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadNTY, [SBPort23], 7, [1], 1>; +defm : X86WriteRes<WriteVecMaskedLoad, [SBPort23,SBPort05], 8, [1,2], 3>; +defm : X86WriteRes<WriteVecMaskedLoadY, [SBPort23,SBPort05], 9, [1,2], 3>; +defm : X86WriteRes<WriteVecStore, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteVecStoreX, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteVecStoreY, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteVecStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteVecStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteVecMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteVecMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; +defm : X86WriteRes<WriteVecMove, [SBPort05], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [SBPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [SBPort05], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveToGpr, [SBPort0], 2, [1], 1>; +defm : X86WriteRes<WriteVecMoveFromGpr, [SBPort5], 1, [1], 1>; + +defm : SBWriteResPair<WriteVecLogic, [SBPort015], 1, [1], 1, 5>; +defm : SBWriteResPair<WriteVecLogicX,[SBPort015], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteVecLogicY,[SBPort015], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteVecLogicZ,[SBPort015], 1, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteVecTest, [SBPort0,SBPort5], 2, [1,1], 2, 6>; +defm : SBWriteResPair<WriteVecTestY, [SBPort0,SBPort5], 2, [1,1], 2, 7>; +defm : SBWriteResPair<WriteVecTestZ, [SBPort0,SBPort5], 2, [1,1], 2, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteVecALU, [SBPort1], 3, [1], 1, 5>; +defm : SBWriteResPair<WriteVecALUX, [SBPort15], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteVecALUY, [SBPort15], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteVecALUZ, [SBPort15], 1, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteVecIMul, [SBPort0], 5, [1], 1, 5>; +defm : SBWriteResPair<WriteVecIMulX, [SBPort0], 5, [1], 1, 6>; +defm : SBWriteResPair<WriteVecIMulY, [SBPort0], 5, [1], 1, 7>; +defm : SBWriteResPair<WriteVecIMulZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WritePMULLD, [SBPort0], 5, [1], 1, 6>; +defm : SBWriteResPair<WritePMULLDY, [SBPort0], 5, [1], 1, 7>; // TODO this is probably wrong for 256/512-bit for the "generic" model +defm : SBWriteResPair<WritePMULLDZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteShuffle, [SBPort5], 1, [1], 1, 5>; +defm : SBWriteResPair<WriteShuffleX, [SBPort15], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteShuffleY, [SBPort5], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteShuffleZ, [SBPort5], 1, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteVarShuffle, [SBPort15], 1, [1], 1, 5>; +defm : SBWriteResPair<WriteVarShuffleX, [SBPort15], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteVarShuffleY, [SBPort15], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteVarShuffleZ, [SBPort15], 1, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteBlend, [SBPort15], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteBlendY, [SBPort15], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteBlendZ, [SBPort15], 1, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteVarBlend, [SBPort15], 2, [2], 2, 6>; +defm : SBWriteResPair<WriteVarBlendY,[SBPort15], 2, [2], 2, 7>; +defm : 
SBWriteResPair<WriteVarBlendZ,[SBPort15], 2, [2], 2, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteMPSAD, [SBPort0, SBPort15], 7, [1,2], 3, 6>; +defm : SBWriteResPair<WriteMPSADY, [SBPort0, SBPort15], 7, [1,2], 3, 7>; +defm : SBWriteResPair<WriteMPSADZ, [SBPort0, SBPort15], 7, [1,2], 3, 7>; // Unsupported = 1 +defm : SBWriteResPair<WritePSADBW, [SBPort0], 5, [1], 1, 5>; +defm : SBWriteResPair<WritePSADBWX, [SBPort0], 5, [1], 1, 6>; +defm : SBWriteResPair<WritePSADBWY, [SBPort0], 5, [1], 1, 7>; +defm : SBWriteResPair<WritePSADBWZ, [SBPort0], 5, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WritePHMINPOS, [SBPort0], 5, [1], 1, 6>; + +// Vector integer shifts. +defm : SBWriteResPair<WriteVecShift, [SBPort5], 1, [1], 1, 5>; +defm : SBWriteResPair<WriteVecShiftX, [SBPort0,SBPort15], 2, [1,1], 2, 6>; +defm : SBWriteResPair<WriteVecShiftY, [SBPort0,SBPort15], 4, [1,1], 2, 7>; +defm : SBWriteResPair<WriteVecShiftZ, [SBPort0,SBPort15], 4, [1,1], 2, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteVecShiftImm, [SBPort5], 1, [1], 1, 5>; +defm : SBWriteResPair<WriteVecShiftImmX, [SBPort0], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteVecShiftImmY, [SBPort0], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteVecShiftImmZ, [SBPort0], 1, [1], 1, 7>; // Unsupported = 1 +defm : SBWriteResPair<WriteVarVecShift, [SBPort0], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteVarVecShiftY, [SBPort0], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteVarVecShiftZ, [SBPort0], 1, [1], 1, 7>; // Unsupported = 1 + +// Vector insert/extract operations. +def : WriteRes<WriteVecInsert, [SBPort5,SBPort15]> { + let Latency = 2; + let NumMicroOps = 2; +} +def : WriteRes<WriteVecInsertLd, [SBPort23,SBPort15]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def : WriteRes<WriteVecExtract, [SBPort0,SBPort15]> { + let Latency = 3; + let NumMicroOps = 2; +} +def : WriteRes<WriteVecExtractSt, [SBPort4,SBPort23,SBPort15]> { + let Latency = 5; + let NumMicroOps = 3; +} + +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : SBWriteResPair<WriteFHAdd, [SBPort1,SBPort5], 5, [1,2], 3, 6>; +defm : SBWriteResPair<WriteFHAddY, [SBPort1,SBPort5], 5, [1,2], 3, 7>; +defm : SBWriteResPair<WriteFHAddZ, [SBPort1,SBPort5], 5, [1,2], 3, 7>; // Unsupported = 1 +defm : SBWriteResPair<WritePHAdd, [SBPort15], 3, [3], 3, 5>; +defm : SBWriteResPair<WritePHAddX, [SBPort15], 3, [3], 3, 6>; +defm : SBWriteResPair<WritePHAddY, [SBPort15], 3, [3], 3, 7>; +defm : SBWriteResPair<WritePHAddZ, [SBPort15], 3, [3], 3, 7>; // Unsupported = 1 + +//////////////////////////////////////////////////////////////////////////////// +// String instructions. 
+//////////////////////////////////////////////////////////////////////////////// + +// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [SBPort0]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrMLd, [SBPort0, SBPort23]> { + let Latency = 17; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [SBPort015]> { + let Latency = 11; + let ResourceCycles = [8]; +} +def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> { + let Latency = 11; + let ResourceCycles = [7, 1]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [SBPort0]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrILd, [SBPort0,SBPort23]> { + let Latency = 17; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [SBPort015]> { + let Latency = 4; + let ResourceCycles = [8]; +} +def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> { + let Latency = 4; + let ResourceCycles = [7, 1]; +} + +// MOVMSK Instructions. +def : WriteRes<WriteFMOVMSK, [SBPort0]> { let Latency = 2; } +def : WriteRes<WriteVecMOVMSK, [SBPort0]> { let Latency = 2; } +def : WriteRes<WriteVecMOVMSKY, [SBPort0]> { let Latency = 2; } +def : WriteRes<WriteMMXMOVMSK, [SBPort0]> { let Latency = 1; } + +// AES Instructions. +def : WriteRes<WriteAESDecEnc, [SBPort5,SBPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def : WriteRes<WriteAESDecEncLd, [SBPort5,SBPort23,SBPort015]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} + +def : WriteRes<WriteAESIMC, [SBPort5]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESIMCLd, [SBPort5,SBPort23]> { + let Latency = 18; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} + +def : WriteRes<WriteAESKeyGen, [SBPort015]> { + let Latency = 8; + let ResourceCycles = [11]; +} +def : WriteRes<WriteAESKeyGenLd, [SBPort015, SBPort23]> { + let Latency = 8; + let ResourceCycles = [10, 1]; +} + +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [SBPort015]> { + let Latency = 14; + let ResourceCycles = [18]; +} +def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> { + let Latency = 14; + let ResourceCycles = [17, 1]; +} + +// Load/store MXCSR. +// FIXME: This is probably wrong. Only STMXCSR should require Port4. +def : WriteRes<WriteLDMXCSR, [SBPort0,SBPort4,SBPort5,SBPort23]> { let Latency = 5; let NumMicroOps = 4; let ResourceCycles = [1,1,1,1]; } +def : WriteRes<WriteSTMXCSR, [SBPort0,SBPort4,SBPort5,SBPort23]> { let Latency = 5; let NumMicroOps = 4; let ResourceCycles = [1,1,1,1]; } + +def : WriteRes<WriteSystem, [SBPort015]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; } +def : WriteRes<WriteFence, [SBPort23, SBPort4]>; +def : WriteRes<WriteNop, []>; + +// AVX2/FMA is not supported on that architecture, but we should define the basic +// scheduling resources anyway. 
+defm : SBWriteResPair<WriteFShuffle256, [SBPort5], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteFVarShuffle256, [SBPort5], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteShuffle256, [SBPort5], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteVarShuffle256, [SBPort5], 1, [1], 1, 7>; +defm : SBWriteResPair<WriteFMA, [SBPort01], 5>; +defm : SBWriteResPair<WriteFMAX, [SBPort01], 5>; +defm : SBWriteResPair<WriteFMAY, [SBPort01], 5>; +defm : SBWriteResPair<WriteFMAZ, [SBPort01], 5>; // Unsupported = 1 + +// Remaining SNB instrs. + +def SBWriteResGroup1 : SchedWriteRes<[SBPort1]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup1], (instrs COMP_FST0r, + COM_FST0r, + UCOM_FPr, + UCOM_Fr)>; + +def SBWriteResGroup2 : SchedWriteRes<[SBPort5]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup2], (instrs FDECSTP, FINCSTP, FFREE, FFREEP, FNOP, + LD_Frr, ST_Frr, ST_FPrr)>; +def: InstRW<[SBWriteResGroup2], (instrs LOOP, LOOPE, LOOPNE)>; // FIXME: This seems wrong compared to other Intel CPUs. +def: InstRW<[SBWriteResGroup2], (instrs RETQ)>; + +def SBWriteResGroup4 : SchedWriteRes<[SBPort05]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup4], (instrs CDQ, CQO)>; + +def SBWriteResGroup5 : SchedWriteRes<[SBPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABS(B|D|W)rr", + "MMX_PADDQirr", + "MMX_PALIGNRrri", + "MMX_PSIGN(B|D|W)rr")>; + +def SBWriteResGroup9 : SchedWriteRes<[SBPort05]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SBWriteResGroup9], (instregex "ROL(8|16|32|64)r1", + "ROL(8|16|32|64)ri", + "ROR(8|16|32|64)r1", + "ROR(8|16|32|64)ri", + "SET(A|BE)r")>; + +def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SBWriteResGroup11], (instrs SCASB, + SCASL, + SCASQ, + SCASW)>; + +def SBWriteResGroup12 : SchedWriteRes<[SBPort0,SBPort1]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup12], (instregex "(V?)COMISDrr", + "(V?)COMISSrr", + "(V?)UCOMISDrr", + "(V?)UCOMISSrr")>; + +def SBWriteResGroup15 : SchedWriteRes<[SBPort0,SBPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup15], (instrs CWD, + FNSTSW16r)>; + +def SBWriteResGroup18 : SchedWriteRes<[SBPort5,SBPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup18], (instrs JCXZ, JECXZ, JRCXZ)>; +def: InstRW<[SBWriteResGroup18], (instregex "MMX_MOVDQ2Qrr")>; + +def SBWriteResGroup21 : SchedWriteRes<[SBPort1]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup21], (instregex "PUSHFS64")>; + +def SBWriteResGroup21_16i : SchedWriteRes<[SBPort1, SBPort015]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup21_16i], (instrs IMUL16rri, IMUL16rri8)>; + +def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup22], (instregex "(V?)EXTRACTPSrr")>; + +def SBWriteResGroup23_2 : SchedWriteRes<[SBPort05]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: 
InstRW<[SBWriteResGroup23_2], (instregex "ROL(8|16|32|64)rCL", + "ROR(8|16|32|64)rCL", + "SAR(8|16|32|64)rCL", + "SHL(8|16|32|64)rCL", + "SHR(8|16|32|64)rCL")>; + +def SBWriteResGroup25 : SchedWriteRes<[SBPort015]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SBWriteResGroup25], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr, + XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr, + XCHG16ar, XCHG32ar, XCHG64ar)>; + +def SBWriteResGroup25_1 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup25_1], (instrs LEAVE, LEAVE64)>; + +def SBWriteResGroup26_2 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup26_2], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>; + +def SBWriteResGroup27 : SchedWriteRes<[SBPort0,SBPort1]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup27], (instrs IMUL64r, MUL64r)>; + +def SBWriteResGroup27_1 : SchedWriteRes<[SBPort1,SBPort05,SBPort015]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup27_1], (instrs IMUL32r, MUL32r)>; + +def SBWriteResGroup27_2 : SchedWriteRes<[SBPort1,SBPort05,SBPort015]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup27_2], (instrs IMUL16r, MUL16r)>; + +def SBWriteResGroup29 : SchedWriteRes<[SBPort1,SBPort015]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup29], (instregex "MOV64sr")>; + +def SBWriteResGroup29_2 : SchedWriteRes<[SBPort5,SBPort015]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup29_2], (instrs PAUSE)>; + +def SBWriteResGroup30 : SchedWriteRes<[SBPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup30], (instregex "(V?)PCMPGTQrr")>; + +def SBWriteResGroup31 : SchedWriteRes<[SBPort23]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm(8|16|32)", + "MOVZX(16|32|64)rm(8|16)")>; + +def SBWriteResGroup33 : SchedWriteRes<[SBPort4,SBPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup33], (instregex "PUSH(16r|32r|64r|64i8)")>; + +def SBWriteResGroup35 : SchedWriteRes<[SBPort1,SBPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup35], (instrs CLI)>; + +def SBWriteResGroup35_2 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP(16|32|64)m", + "PUSHGS64")>; + +def SBWriteResGroup36 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup36], (instrs CALL64pcrel32)>; +def: InstRW<[SBWriteResGroup36], (instregex "CALL(16|32|64)r", + "(V?)EXTRACTPSmr")>; + +def SBWriteResGroup40 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup40], (instrs STOSB, STOSL, STOSQ, STOSW)>; + +def SBWriteResGroup41 : SchedWriteRes<[SBPort5,SBPort015]> { + let Latency 
= 5; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup41], (instrs FNINIT)>; + +def SBWriteResGroup42 : SchedWriteRes<[SBPort05,SBPort015]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG(8|16|32|64)rr")>; + +def SBWriteResGroup43 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup43], (instregex "SET(A|BE)m")>; + +def SBWriteResGroup45 : SchedWriteRes<[SBPort0,SBPort4,SBPort23,SBPort15]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SBWriteResGroup45], (instregex "(V?)PEXTR(D|Q)mr", + "PUSHF(16|64)")>; + +def SBWriteResGroup46 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SBWriteResGroup46], (instregex "CLFLUSH")>; + +def SBWriteResGroup47 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SBWriteResGroup47], (instregex "FXRSTOR")>; + +def SBWriteResGroup48 : SchedWriteRes<[SBPort23]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup48], (instregex "MMX_MOVD64from64rm", + "POP(16|32|64)r", + "VBROADCASTSSrm", + "(V?)MOV64toPQIrm", + "(V?)MOVDDUPrm", + "(V?)MOVDI2PDIrm", + "(V?)MOVQI2PQIrm", + "(V?)MOVSDrm", + "(V?)MOVSHDUPrm", + "(V?)MOVSLDUPrm", + "(V?)MOVSSrm")>; + +def SBWriteResGroup49 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup49], (instregex "MOV16sm")>; + +def SBWriteResGroup50 : SchedWriteRes<[SBPort23,SBPort05]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup50], (instregex "BT(16|32|64)mi8")>; + +def SBWriteResGroup51 : SchedWriteRes<[SBPort23,SBPort15]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABS(B|D|W)rm", + "MMX_PALIGNRrmi", + "MMX_PSIGN(B|D|W)rm")>; + +def SBWriteResGroup52 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup52], (instrs LODSL, LODSQ)>; + +def SBWriteResGroup53 : SchedWriteRes<[SBPort4,SBPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup53], (instregex "ST_F(32|64)m", + "ST_FP(32|64|80)m")>; + +def SBWriteResGroup54 : SchedWriteRes<[SBPort23]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSDYrm", + "VBROADCASTSSYrm", + "VMOVDDUPYrm", + "VMOVSHDUPYrm", + "VMOVSLDUPYrm")>; + +def SBWriteResGroup58 : SchedWriteRes<[SBPort23,SBPort05]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup58], (instrs VINSERTF128rm)>; + +def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup59], (instregex "MMX_PADDQirm")>; + +def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup62], (instregex "VER(R|W)m")>; + +def 
SBWriteResGroup63 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup63], (instrs LODSB, LODSW)>; + +def SBWriteResGroup64 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup64], (instrs FARJMP64)>; + +def SBWriteResGroup66 : SchedWriteRes<[SBPort0,SBPort4,SBPort23]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup66], (instrs FNSTSWm)>; + +def SBWriteResGroup67 : SchedWriteRes<[SBPort1,SBPort5,SBPort015]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup67], (instregex "SLDT(16|32|64)r", + "STR(16|32|64)r")>; + +def SBWriteResGroup68 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup68], (instrs FNSTCW16m)>; +def: InstRW<[SBWriteResGroup68], (instregex "CALL(16|32|64)m")>; + +def SBWriteResGroup69 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup69], (instregex "BTC(16|32|64)mi8", + "BTR(16|32|64)mi8", + "BTS(16|32|64)mi8", + "SAR(8|16|32|64)m1", + "SAR(8|16|32|64)mi", + "SHL(8|16|32|64)m1", + "SHL(8|16|32|64)mi", + "SHR(8|16|32|64)m1", + "SHR(8|16|32|64)mi")>; + +def SBWriteResGroup77 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup77], (instregex "(V?)(U?)COMI(SD|SS)rm")>; + +def SBWriteResGroup81 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG(8|16|32|64)rm")>; + +def SBWriteResGroup83 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; +} +def: InstRW<[SBWriteResGroup83], (instrs CMPSB, + CMPSL, + CMPSQ, + CMPSW)>; + +def SBWriteResGroup84 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,2,2]; +} +def: InstRW<[SBWriteResGroup84], (instrs FLDCW16m)>; + +def SBWriteResGroup85 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,2,2]; +} +def: InstRW<[SBWriteResGroup85], (instregex "ROL(8|16|32|64)m1", + "ROL(8|16|32|64)mi", + "ROR(8|16|32|64)m1", + "ROR(8|16|32|64)mi")>; + +def SBWriteResGroup86 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,2,2]; +} +def: InstRW<[SBWriteResGroup86], (instrs MOVSB, MOVSL, MOVSQ, MOVSW)>; +def: InstRW<[SBWriteResGroup86], (instregex "XADD(8|16|32|64)rm")>; + +def SBWriteResGroup87 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SBWriteResGroup87], (instrs FARCALL64)>; + +def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup93], (instregex "CVT(T?)SD2SI(64)?rm", + "CVT(T?)SS2SI(64)?rm")>; + +def SBWriteResGroup93_1 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: 
InstRW<[SBWriteResGroup93_1], (instrs IMUL64m, MUL64m)>; + +def SBWriteResGroup93_2 : SchedWriteRes<[SBPort1,SBPort23,SBPort05,SBPort015]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SBWriteResGroup93_2], (instrs IMUL32m, MUL32m)>; + +def SBWriteResGroup93_3 : SchedWriteRes<[SBPort1,SBPort05,SBPort015,SBPort23]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,1,2,1]; +} +def: InstRW<[SBWriteResGroup93_3], (instrs IMUL16m, MUL16m)>; + +def SBWriteResGroup93_4 : SchedWriteRes<[SBPort1,SBPort015,SBPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup93_4], (instrs IMUL16rmi, IMUL16rmi8)>; + +def SBWriteResGroup95 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup95], (instregex "LD_F(32|64|80)m")>; + +def SBWriteResGroup97 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup97], (instregex "IST_F(16|32)m", + "IST_FP(16|32|64)m")>; + +def SBWriteResGroup97_2 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,2,3]; +} +def: InstRW<[SBWriteResGroup97_2], (instregex "ROL(8|16|32|64)mCL", + "ROR(8|16|32|64)mCL", + "SAR(8|16|32|64)mCL", + "SHL(8|16|32|64)mCL", + "SHR(8|16|32|64)mCL")>; + +def SBWriteResGroup98 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,2,3]; +} +def: SchedAlias<WriteADCRMW, SBWriteResGroup98>; + +def SBWriteResGroup99 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,2,2,1]; +} +def: InstRW<[SBWriteResGroup99, ReadAfterLd], (instrs ADC8mr, ADC16mr, ADC32mr, ADC64mr, + SBB8mr, SBB16mr, SBB32mr, SBB64mr)>; + +def SBWriteResGroup100 : SchedWriteRes<[SBPort4,SBPort5,SBPort23,SBPort05,SBPort015]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,1,2,1,1]; +} +def: InstRW<[SBWriteResGroup100], (instregex "BT(16|32|64)mr", + "BTC(16|32|64)mr", + "BTR(16|32|64)mr", + "BTS(16|32|64)mr")>; + +def SBWriteResGroup101 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup101], (instregex "(ADD|SUB|SUBR)_F(32|64)m", + "ILD_F(16|32|64)m")>; + +def SBWriteResGroup104 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup104], (instregex "(V?)PCMPGTQrm")>; + +def SBWriteResGroup106 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup106], (instregex "FICOM(P?)(16|32)m")>; + +def SBWriteResGroup111 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup111], (instregex "MUL_F(32|64)m")>; + +def SBWriteResGroup114 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup114], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>; + +def SBWriteResGroup119 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 15; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup119], 
(instregex "MUL_FI(16|32)m")>; + +def SBWriteResGroup130 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 31; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup130], (instregex "DIV(R?)_F(32|64)m")>; + +def SBWriteResGroup131 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 34; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup131], (instregex "DIV(R?)_FI(16|32)m")>; + +def: InstRW<[WriteZero], (instrs CLC)>; + +} // SchedModel diff --git a/capstone/suite/synctools/tablegen/X86/back/X86SchedSkylakeClient.td b/capstone/suite/synctools/tablegen/X86/back/X86SchedSkylakeClient.td new file mode 100644 index 000000000..bda088e15 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86SchedSkylakeClient.td @@ -0,0 +1,1850 @@ +//=- X86SchedSkylake.td - X86 Skylake Client Scheduling ------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Skylake Client to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def SkylakeClientModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SKylake can + // decode 6 instructions per cycle. + let IssueWidth = 6; + let MicroOpBufferSize = 224; // Based on the reorder buffer. + let LoadLatency = 5; + let MispredictPenalty = 14; + + // Based on the LSD (loop-stream detector) queue size and benchmarking data. + let LoopMicroOpBufferSize = 50; + + // This flag is set to allow the scheduler to assign a default model to + // unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = SkylakeClientModel in { + +// Skylake Client can issue micro-ops to 8 different ports in one cycle. + +// Ports 0, 1, 5, and 6 handle all computation. +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. Port 7 can handle address calculations. +def SKLPort0 : ProcResource<1>; +def SKLPort1 : ProcResource<1>; +def SKLPort2 : ProcResource<1>; +def SKLPort3 : ProcResource<1>; +def SKLPort4 : ProcResource<1>; +def SKLPort5 : ProcResource<1>; +def SKLPort6 : ProcResource<1>; +def SKLPort7 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def SKLPort01 : ProcResGroup<[SKLPort0, SKLPort1]>; +def SKLPort23 : ProcResGroup<[SKLPort2, SKLPort3]>; +def SKLPort237 : ProcResGroup<[SKLPort2, SKLPort3, SKLPort7]>; +def SKLPort04 : ProcResGroup<[SKLPort0, SKLPort4]>; +def SKLPort05 : ProcResGroup<[SKLPort0, SKLPort5]>; +def SKLPort06 : ProcResGroup<[SKLPort0, SKLPort6]>; +def SKLPort15 : ProcResGroup<[SKLPort1, SKLPort5]>; +def SKLPort16 : ProcResGroup<[SKLPort1, SKLPort6]>; +def SKLPort56 : ProcResGroup<[SKLPort5, SKLPort6]>; +def SKLPort015 : ProcResGroup<[SKLPort0, SKLPort1, SKLPort5]>; +def SKLPort056 : ProcResGroup<[SKLPort0, SKLPort5, SKLPort6]>; +def SKLPort0156: ProcResGroup<[SKLPort0, SKLPort1, SKLPort5, SKLPort6]>; + +def SKLDivider : ProcResource<1>; // Integer division issued on port 0. +// FP division and sqrt on port 0. 
+def SKLFPDivider : ProcResource<1>; + +// 60 Entry Unified Scheduler +def SKLPortAny : ProcResGroup<[SKLPort0, SKLPort1, SKLPort2, SKLPort3, SKLPort4, + SKLPort5, SKLPort6, SKLPort7]> { + let BufferSize=60; +} + +// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 5>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SKLWriteResPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1, + int LoadLat = 5> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to + // the latency (default = 5). + def : WriteRes<SchedRW.Folded, !listconcat([SKLPort23], ExePorts)> { + let Latency = !add(Lat, LoadLat); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = !add(UOps, 1); + } +} + +// A folded store needs a cycle on port 4 for the store data, and an extra port +// 2/3/7 cycle to recompute the address. +def : WriteRes<WriteRMW, [SKLPort237,SKLPort4]>; + +// Arithmetic. +defm : SKLWriteResPair<WriteALU, [SKLPort0156], 1>; // Simple integer ALU op. +defm : SKLWriteResPair<WriteADC, [SKLPort06], 1>; // Integer ALU + flags op. +defm : SKLWriteResPair<WriteIMul, [SKLPort1], 3>; // Integer multiplication. +defm : SKLWriteResPair<WriteIMul64, [SKLPort1], 3>; // Integer 64-bit multiplication. + +defm : X86WriteRes<WriteBSWAP32, [SKLPort15], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [SKLPort06, SKLPort15], 2, [1,1], 2>; + +defm : SKLWriteResPair<WriteDiv8, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>; +defm : SKLWriteResPair<WriteDiv16, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>; +defm : SKLWriteResPair<WriteDiv32, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>; +defm : SKLWriteResPair<WriteDiv64, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>; +defm : SKLWriteResPair<WriteIDiv8, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>; +defm : SKLWriteResPair<WriteIDiv16, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>; +defm : SKLWriteResPair<WriteIDiv32, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>; +defm : SKLWriteResPair<WriteIDiv64, [SKLPort0, SKLDivider], 25, [1,10], 1, 4>; + +defm : SKLWriteResPair<WriteCRC32, [SKLPort1], 3>; + +def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part. +def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads. + +defm : SKLWriteResPair<WriteCMOV, [SKLPort06], 1, [1], 1>; // Conditional move. +defm : SKLWriteResPair<WriteCMOV2, [SKLPort06], 2, [2], 2>; // Conditional (CF + ZF flag) move. +defm : X86WriteRes<WriteFCMOV, [SKLPort1], 3, [1], 1>; // x87 conditional move. +def : WriteRes<WriteSETCC, [SKLPort06]>; // Setcc. +def : WriteRes<WriteSETCCStore, [SKLPort06,SKLPort4,SKLPort237]> { + let Latency = 2; + let NumMicroOps = 3; +} +def : WriteRes<WriteLAHFSAHF, [SKLPort06]>; +def : WriteRes<WriteBitTest,[SKLPort06]>; // + +// Bit counts. 
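+// A rough illustration of the ReadAfterLd note above: for a micro-fused
+// load-op such as ADD32rm (reg = reg + mem), the register source is read
+// through ReadAfterLd, so its producer only adds visible latency if its
+// result arrives more than 5 cycles after the load begins. For example, a
+// 3-cycle IMUL (WriteIMul above) feeding that register can typically be
+// hidden behind the 5-cycle load.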
+defm : SKLWriteResPair<WriteBSF, [SKLPort1], 3>; +defm : SKLWriteResPair<WriteBSR, [SKLPort1], 3>; +defm : SKLWriteResPair<WriteLZCNT, [SKLPort1], 3>; +defm : SKLWriteResPair<WriteTZCNT, [SKLPort1], 3>; +defm : SKLWriteResPair<WritePOPCNT, [SKLPort1], 3>; + +// Integer shifts and rotates. +defm : SKLWriteResPair<WriteShift, [SKLPort06], 1>; + +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [SKLPort1], 3, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[SKLPort1,SKLPort06,SKLPort0156], 6, [1, 2, 1], 4>; +defm : X86WriteRes<WriteSHDmri, [SKLPort1,SKLPort23,SKLPort237,SKLPort0156], 9, [1, 1, 1, 1], 4>; +defm : X86WriteRes<WriteSHDmrcl,[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort0156], 11, [1, 1, 1, 2, 1], 6>; + +// BMI1 BEXTR, BMI2 BZHI +defm : SKLWriteResPair<WriteBEXTR, [SKLPort06,SKLPort15], 2, [1,1], 2>; +defm : SKLWriteResPair<WriteBZHI, [SKLPort15], 1>; + +// Loads, stores, and moves, not folded with other operations. +defm : X86WriteRes<WriteLoad, [SKLPort23], 5, [1], 1>; +defm : X86WriteRes<WriteStore, [SKLPort237, SKLPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteStoreNT, [SKLPort237, SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteMove, [SKLPort0156], 1, [1], 1>; + +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +def : WriteRes<WriteZero, []>; + +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +defm : SKLWriteResPair<WriteJump, [SKLPort06], 1>; + +// Floating point. This covers both scalar and vector operations. +defm : X86WriteRes<WriteFLD0, [SKLPort05], 1, [1], 1>; +defm : X86WriteRes<WriteFLD1, [SKLPort05], 1, [2], 2>; +defm : X86WriteRes<WriteFLDC, [SKLPort05], 1, [2], 2>; +defm : X86WriteRes<WriteFLoad, [SKLPort23], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [SKLPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [SKLPort23], 7, [1], 1>; +defm : X86WriteRes<WriteFMaskedLoad, [SKLPort23,SKLPort015], 7, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedLoadY, [SKLPort23,SKLPort015], 8, [1,1], 2>; +defm : X86WriteRes<WriteFStore, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreNTX, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>; +defm : X86WriteRes<WriteEMMS, [SKLPort05,SKLPort0156], 10, [9,1], 10>; + +defm : SKLWriteResPair<WriteFAdd, [SKLPort01], 4, [1], 1, 5>; // Floating point add/sub. +defm : SKLWriteResPair<WriteFAddX, [SKLPort01], 4, [1], 1, 6>; +defm : SKLWriteResPair<WriteFAddY, [SKLPort01], 4, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFAddZ>; +defm : SKLWriteResPair<WriteFAdd64, [SKLPort01], 4, [1], 1, 5>; // Floating point double add/sub. 
+defm : SKLWriteResPair<WriteFAdd64X, [SKLPort01], 4, [1], 1, 6>; +defm : SKLWriteResPair<WriteFAdd64Y, [SKLPort01], 4, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFAdd64Z>; + +defm : SKLWriteResPair<WriteFCmp, [SKLPort01], 4, [1], 1, 5>; // Floating point compare. +defm : SKLWriteResPair<WriteFCmpX, [SKLPort01], 4, [1], 1, 6>; +defm : SKLWriteResPair<WriteFCmpY, [SKLPort01], 4, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFCmpZ>; +defm : SKLWriteResPair<WriteFCmp64, [SKLPort01], 4, [1], 1, 5>; // Floating point double compare. +defm : SKLWriteResPair<WriteFCmp64X, [SKLPort01], 4, [1], 1, 6>; +defm : SKLWriteResPair<WriteFCmp64Y, [SKLPort01], 4, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFCmp64Z>; + +defm : SKLWriteResPair<WriteFCom, [SKLPort0], 2>; // Floating point compare to flags. + +defm : SKLWriteResPair<WriteFMul, [SKLPort01], 4, [1], 1, 5>; // Floating point multiplication. +defm : SKLWriteResPair<WriteFMulX, [SKLPort01], 4, [1], 1, 6>; +defm : SKLWriteResPair<WriteFMulY, [SKLPort01], 4, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFMulZ>; +defm : SKLWriteResPair<WriteFMul64, [SKLPort01], 4, [1], 1, 5>; // Floating point double multiplication. +defm : SKLWriteResPair<WriteFMul64X, [SKLPort01], 4, [1], 1, 6>; +defm : SKLWriteResPair<WriteFMul64Y, [SKLPort01], 4, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFMul64Z>; + +defm : SKLWriteResPair<WriteFDiv, [SKLPort0,SKLFPDivider], 11, [1,3], 1, 5>; // Floating point division. +//defm : SKLWriteResPair<WriteFDivX, [SKLPort0,SKLFPDivider], 11, [1,3], 1, 6>; +defm : SKLWriteResPair<WriteFDivY, [SKLPort0,SKLFPDivider], 11, [1,5], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFDivZ>; +//defm : SKLWriteResPair<WriteFDiv64, [SKLPort0,SKLFPDivider], 14, [1,3], 1, 5>; // Floating point double division. +//defm : SKLWriteResPair<WriteFDiv64X, [SKLPort0,SKLFPDivider], 14, [1,3], 1, 6>; +//defm : SKLWriteResPair<WriteFDiv64Y, [SKLPort0,SKLFPDivider], 14, [1,5], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFDiv64Z>; + +defm : SKLWriteResPair<WriteFSqrt, [SKLPort0,SKLFPDivider], 12, [1,3], 1, 5>; // Floating point square root. +defm : SKLWriteResPair<WriteFSqrtX, [SKLPort0,SKLFPDivider], 12, [1,3], 1, 6>; +defm : SKLWriteResPair<WriteFSqrtY, [SKLPort0,SKLFPDivider], 12, [1,6], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFSqrtZ>; +defm : SKLWriteResPair<WriteFSqrt64, [SKLPort0,SKLFPDivider], 18, [1,6], 1, 5>; // Floating point double square root. +defm : SKLWriteResPair<WriteFSqrt64X, [SKLPort0,SKLFPDivider], 18, [1,6], 1, 6>; +defm : SKLWriteResPair<WriteFSqrt64Y, [SKLPort0,SKLFPDivider], 18, [1,12],1, 7>; +defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; +defm : SKLWriteResPair<WriteFSqrt80, [SKLPort0,SKLFPDivider], 21, [1,7]>; // Floating point long double square root. + +defm : SKLWriteResPair<WriteFRcp, [SKLPort0], 4, [1], 1, 5>; // Floating point reciprocal estimate. +defm : SKLWriteResPair<WriteFRcpX, [SKLPort0], 4, [1], 1, 6>; +defm : SKLWriteResPair<WriteFRcpY, [SKLPort0], 4, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFRcpZ>; + +defm : SKLWriteResPair<WriteFRsqrt, [SKLPort0], 4, [1], 1, 5>; // Floating point reciprocal square root estimate. +defm : SKLWriteResPair<WriteFRsqrtX,[SKLPort0], 4, [1], 1, 6>; +defm : SKLWriteResPair<WriteFRsqrtY,[SKLPort0], 4, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; + +defm : SKLWriteResPair<WriteFMA, [SKLPort01], 4, [1], 1, 5>; // Fused Multiply Add. 
+defm : SKLWriteResPair<WriteFMAX, [SKLPort01], 4, [1], 1, 6>; +defm : SKLWriteResPair<WriteFMAY, [SKLPort01], 4, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFMAZ>; +defm : SKLWriteResPair<WriteDPPD, [SKLPort5,SKLPort01], 9, [1,2], 3, 6>; // Floating point double dot product. +defm : SKLWriteResPair<WriteDPPS, [SKLPort5,SKLPort01], 13, [1,3], 4, 6>; +defm : SKLWriteResPair<WriteDPPSY, [SKLPort5,SKLPort01], 13, [1,3], 4, 7>; +defm : X86WriteResPairUnsupported<WriteDPPSZ>; +defm : SKLWriteResPair<WriteFSign, [SKLPort0], 1>; // Floating point fabs/fchs. +defm : SKLWriteResPair<WriteFRnd, [SKLPort01], 8, [2], 2, 6>; // Floating point rounding. +defm : SKLWriteResPair<WriteFRndY, [SKLPort01], 8, [2], 2, 7>; +defm : X86WriteResPairUnsupported<WriteFRndZ>; +defm : SKLWriteResPair<WriteFLogic, [SKLPort015], 1, [1], 1, 6>; // Floating point and/or/xor logicals. +defm : SKLWriteResPair<WriteFLogicY, [SKLPort015], 1, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFLogicZ>; +defm : SKLWriteResPair<WriteFTest, [SKLPort0], 2, [1], 1, 6>; // Floating point TEST instructions. +defm : SKLWriteResPair<WriteFTestY, [SKLPort0], 2, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFTestZ>; +defm : SKLWriteResPair<WriteFShuffle, [SKLPort5], 1, [1], 1, 6>; // Floating point vector shuffles. +defm : SKLWriteResPair<WriteFShuffleY, [SKLPort5], 1, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFShuffleZ>; +defm : SKLWriteResPair<WriteFVarShuffle, [SKLPort5], 1, [1], 1, 6>; // Floating point vector shuffles. +defm : SKLWriteResPair<WriteFVarShuffleY, [SKLPort5], 1, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; +defm : SKLWriteResPair<WriteFBlend, [SKLPort015], 1, [1], 1, 6>; // Floating point vector blends. +defm : SKLWriteResPair<WriteFBlendY, [SKLPort015], 1, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteFBlendZ>; +defm : SKLWriteResPair<WriteFVarBlend, [SKLPort015], 2, [2], 2, 6>; // Fp vector variable blends. +defm : SKLWriteResPair<WriteFVarBlendY,[SKLPort015], 2, [2], 2, 7>; +defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; + +// FMA Scheduling helper class. +// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } + +// Vector integer operations. 
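+// Note: as with the FP loads above, the memory forms below assume a 5-cycle
+// latency for MMX/scalar loads, 6 cycles for 128-bit and 7 cycles for 256-bit
+// loads, which is why the SKLWriteResPair instances in this file pass LoadLat
+// values of 5, 6 or 7.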
+defm : X86WriteRes<WriteVecLoad, [SKLPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [SKLPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [SKLPort23], 7, [1], 1>; +defm : X86WriteRes<WriteVecLoadNT, [SKLPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadNTY, [SKLPort23], 7, [1], 1>; +defm : X86WriteRes<WriteVecMaskedLoad, [SKLPort23,SKLPort015], 7, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedLoadY, [SKLPort23,SKLPort015], 8, [1,1], 2>; +defm : X86WriteRes<WriteVecStore, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecMove, [SKLPort05], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [SKLPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [SKLPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveToGpr, [SKLPort0], 2, [1], 1>; +defm : X86WriteRes<WriteVecMoveFromGpr, [SKLPort5], 1, [1], 1>; + +defm : SKLWriteResPair<WriteVecALU, [SKLPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals. +defm : SKLWriteResPair<WriteVecALUX, [SKLPort01], 1, [1], 1, 6>; +defm : SKLWriteResPair<WriteVecALUY, [SKLPort01], 1, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteVecALUZ>; +defm : SKLWriteResPair<WriteVecLogic, [SKLPort05], 1, [1], 1, 5>; // Vector integer and/or/xor. +defm : SKLWriteResPair<WriteVecLogicX,[SKLPort015], 1, [1], 1, 6>; +defm : SKLWriteResPair<WriteVecLogicY,[SKLPort015], 1, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteVecLogicZ>; +defm : SKLWriteResPair<WriteVecTest, [SKLPort0,SKLPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions. +defm : SKLWriteResPair<WriteVecTestY, [SKLPort0,SKLPort5], 3, [1,1], 2, 7>; +defm : X86WriteResPairUnsupported<WriteVecTestZ>; +defm : SKLWriteResPair<WriteVecIMul, [SKLPort0] , 4, [1], 1, 5>; // Vector integer multiply. +defm : SKLWriteResPair<WriteVecIMulX, [SKLPort01], 4, [1], 1, 6>; +defm : SKLWriteResPair<WriteVecIMulY, [SKLPort01], 4, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteVecIMulZ>; +defm : SKLWriteResPair<WritePMULLD, [SKLPort01], 10, [2], 2, 6>; // Vector PMULLD. +defm : SKLWriteResPair<WritePMULLDY, [SKLPort01], 10, [2], 2, 7>; +defm : X86WriteResPairUnsupported<WritePMULLDZ>; +defm : SKLWriteResPair<WriteShuffle, [SKLPort5], 1, [1], 1, 5>; // Vector shuffles. +defm : SKLWriteResPair<WriteShuffleX, [SKLPort5], 1, [1], 1, 6>; +defm : SKLWriteResPair<WriteShuffleY, [SKLPort5], 1, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteShuffleZ>; +defm : SKLWriteResPair<WriteVarShuffle, [SKLPort5], 1, [1], 1, 5>; // Vector shuffles. +defm : SKLWriteResPair<WriteVarShuffleX, [SKLPort5], 1, [1], 1, 6>; +defm : SKLWriteResPair<WriteVarShuffleY, [SKLPort5], 1, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; +defm : SKLWriteResPair<WriteBlend, [SKLPort5], 1, [1], 1, 6>; // Vector blends. +defm : SKLWriteResPair<WriteBlendY, [SKLPort5], 1, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteBlendZ>; +defm : SKLWriteResPair<WriteVarBlend, [SKLPort015], 2, [2], 2, 6>; // Vector variable blends. 
+defm : SKLWriteResPair<WriteVarBlendY, [SKLPort015], 2, [2], 2, 6>; +defm : X86WriteResPairUnsupported<WriteVarBlendZ>; +defm : SKLWriteResPair<WriteMPSAD, [SKLPort5], 4, [2], 2, 6>; // Vector MPSAD. +defm : SKLWriteResPair<WriteMPSADY, [SKLPort5], 4, [2], 2, 7>; +defm : X86WriteResPairUnsupported<WriteMPSADZ>; +defm : SKLWriteResPair<WritePSADBW, [SKLPort5], 3, [1], 1, 5>; // Vector PSADBW. +defm : SKLWriteResPair<WritePSADBWX, [SKLPort5], 3, [1], 1, 6>; +defm : SKLWriteResPair<WritePSADBWY, [SKLPort5], 3, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WritePSADBWZ>; +defm : SKLWriteResPair<WritePHMINPOS, [SKLPort01], 4, [1], 1, 6>; // Vector PHMINPOS. + +// Vector integer shifts. +defm : SKLWriteResPair<WriteVecShift, [SKLPort0], 1, [1], 1, 5>; +defm : X86WriteRes<WriteVecShiftX, [SKLPort5,SKLPort01], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecShiftY, [SKLPort5,SKLPort01], 4, [1,1], 2>; +defm : X86WriteRes<WriteVecShiftXLd, [SKLPort01,SKLPort23], 7, [1,1], 2>; +defm : X86WriteRes<WriteVecShiftYLd, [SKLPort01,SKLPort23], 8, [1,1], 2>; +defm : X86WriteResPairUnsupported<WriteVecShiftZ>; + +defm : SKLWriteResPair<WriteVecShiftImm, [SKLPort0], 1, [1], 1, 5>; // Vector integer immediate shifts. +defm : SKLWriteResPair<WriteVecShiftImmX, [SKLPort01], 1, [1], 1, 6>; +defm : SKLWriteResPair<WriteVecShiftImmY, [SKLPort01], 1, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; +defm : SKLWriteResPair<WriteVarVecShift, [SKLPort01], 1, [1], 1, 6>; // Variable vector shifts. +defm : SKLWriteResPair<WriteVarVecShiftY, [SKLPort01], 1, [1], 1, 7>; +defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; + +// Vector insert/extract operations. +def : WriteRes<WriteVecInsert, [SKLPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVecInsertLd, [SKLPort5,SKLPort23]> { + let Latency = 6; + let NumMicroOps = 2; +} +def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>; + +def : WriteRes<WriteVecExtract, [SKLPort0,SKLPort5]> { + let Latency = 3; + let NumMicroOps = 2; +} +def : WriteRes<WriteVecExtractSt, [SKLPort4,SKLPort5,SKLPort237]> { + let Latency = 2; + let NumMicroOps = 3; +} + +// Conversion between integer and float. 
+defm : SKLWriteResPair<WriteCvtSS2I, [SKLPort1], 3>; +defm : SKLWriteResPair<WriteCvtPS2I, [SKLPort1], 3>; +defm : SKLWriteResPair<WriteCvtPS2IY, [SKLPort1], 3>; +defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; +defm : SKLWriteResPair<WriteCvtSD2I, [SKLPort1], 3>; +defm : SKLWriteResPair<WriteCvtPD2I, [SKLPort1], 3>; +defm : SKLWriteResPair<WriteCvtPD2IY, [SKLPort1], 3>; +defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; + +defm : SKLWriteResPair<WriteCvtI2SS, [SKLPort1], 4>; +defm : SKLWriteResPair<WriteCvtI2PS, [SKLPort1], 4>; +defm : SKLWriteResPair<WriteCvtI2PSY, [SKLPort1], 4>; +defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; +defm : SKLWriteResPair<WriteCvtI2SD, [SKLPort1], 4>; +defm : SKLWriteResPair<WriteCvtI2PD, [SKLPort1], 4>; +defm : SKLWriteResPair<WriteCvtI2PDY, [SKLPort1], 4>; +defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; + +defm : SKLWriteResPair<WriteCvtSS2SD, [SKLPort1], 3>; +defm : SKLWriteResPair<WriteCvtPS2PD, [SKLPort1], 3>; +defm : SKLWriteResPair<WriteCvtPS2PDY, [SKLPort1], 3>; +defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; +defm : SKLWriteResPair<WriteCvtSD2SS, [SKLPort1], 3>; +defm : SKLWriteResPair<WriteCvtPD2PS, [SKLPort1], 3>; +defm : SKLWriteResPair<WriteCvtPD2PSY, [SKLPort1], 3>; +defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; + +defm : X86WriteRes<WriteCvtPH2PS, [SKLPort5,SKLPort015], 5, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSY, [SKLPort5,SKLPort01], 7, [1,1], 2>; +defm : X86WriteResUnsupported<WriteCvtPH2PSZ>; +defm : X86WriteRes<WriteCvtPH2PSLd, [SKLPort23,SKLPort01], 9, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSYLd, [SKLPort23,SKLPort01], 10, [1,1], 2>; +defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>; + +defm : X86WriteRes<WriteCvtPS2PH, [SKLPort5,SKLPort015], 5, [1,1], 2>; +defm : X86WriteRes<WriteCvtPS2PHY, [SKLPort5,SKLPort01], 7, [1,1], 2>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; +defm : X86WriteRes<WriteCvtPS2PHSt, [SKLPort4,SKLPort5,SKLPort237,SKLPort01], 6, [1,1,1,1], 4>; +defm : X86WriteRes<WriteCvtPS2PHYSt, [SKLPort4,SKLPort5,SKLPort237,SKLPort01], 8, [1,1,1,1], 4>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; + +// Strings instructions. 
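+// The SSE4.2 string compares below are spelled out as explicit SchedWriteRes
+// groups; each memory form repeats the register form's resources plus one
+// SKLPort23 cycle and six extra cycles of latency for the load.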
+ +// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [SKLPort0]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrMLd, [SKLPort0, SKLPort23]> { + let Latency = 16; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [SKLPort0, SKLPort5, SKLPort015, SKLPort0156]> { + let Latency = 19; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def : WriteRes<WritePCmpEStrMLd, [SKLPort0, SKLPort5,SKLPort23, SKLPort015, SKLPort0156]> { + let Latency = 25; + let NumMicroOps = 10; + let ResourceCycles = [4,3,1,1,1]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [SKLPort0]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrILd, [SKLPort0, SKLPort23]> { + let Latency = 16; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [SKLPort0, SKLPort5, SKLPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [4,3,1]; +} +def : WriteRes<WritePCmpEStrILd, [SKLPort0, SKLPort5, SKLPort23, SKLPort0156]> { + let Latency = 24; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} + +// MOVMSK Instructions. +def : WriteRes<WriteFMOVMSK, [SKLPort0]> { let Latency = 2; } +def : WriteRes<WriteVecMOVMSK, [SKLPort0]> { let Latency = 2; } +def : WriteRes<WriteVecMOVMSKY, [SKLPort0]> { let Latency = 2; } +def : WriteRes<WriteMMXMOVMSK, [SKLPort0]> { let Latency = 2; } + +// AES instructions. +def : WriteRes<WriteAESDecEnc, [SKLPort0]> { // Decryption, encryption. + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESDecEncLd, [SKLPort0, SKLPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} + +def : WriteRes<WriteAESIMC, [SKLPort0]> { // InvMixColumn. + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESIMCLd, [SKLPort0, SKLPort23]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} + +def : WriteRes<WriteAESKeyGen, [SKLPort0, SKLPort5, SKLPort015]> { // Key Generation. + let Latency = 20; + let NumMicroOps = 11; + let ResourceCycles = [3,6,2]; +} +def : WriteRes<WriteAESKeyGenLd, [SKLPort0, SKLPort5, SKLPort23, SKLPort015]> { + let Latency = 25; + let NumMicroOps = 11; + let ResourceCycles = [3,6,1,1]; +} + +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [SKLPort5]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def : WriteRes<WriteCLMulLd, [SKLPort5, SKLPort23]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} + +// Catch-all for expensive system instructions. +def : WriteRes<WriteSystem, [SKLPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite; + +// AVX2. +defm : SKLWriteResPair<WriteFShuffle256, [SKLPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles. +defm : SKLWriteResPair<WriteFVarShuffle256, [SKLPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles. +defm : SKLWriteResPair<WriteShuffle256, [SKLPort5], 3, [1], 1, 7>; // 256-bit width vector shuffles. +defm : SKLWriteResPair<WriteVarShuffle256, [SKLPort5], 3, [1], 1, 7>; // 256-bit width vector variable shuffles. + +// Old microcoded instructions that nobody use. 
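+// WriteMicrocoded below is the catch-all for these: a 100-cycle latency on
+// the general ALU group (SKLPort0156), so the scheduler simply treats such
+// instructions as very expensive.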
+def : WriteRes<WriteMicrocoded, [SKLPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite; + +// Fence instructions. +def : WriteRes<WriteFence, [SKLPort23, SKLPort4]>; + +// Load/store MXCSR. +def : WriteRes<WriteLDMXCSR, [SKLPort0,SKLPort23,SKLPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } +def : WriteRes<WriteSTMXCSR, [SKLPort4,SKLPort5,SKLPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } + +// Nop, not very useful expect it provides a model for nops! +def : WriteRes<WriteNop, []>; + +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : SKLWriteResPair<WriteFHAdd, [SKLPort5,SKLPort01], 6, [2,1], 3, 6>; +defm : SKLWriteResPair<WriteFHAddY, [SKLPort5,SKLPort01], 6, [2,1], 3, 7>; +defm : SKLWriteResPair<WritePHAdd, [SKLPort5,SKLPort05], 3, [2,1], 3, 5>; +defm : SKLWriteResPair<WritePHAddX, [SKLPort5,SKLPort015], 3, [2,1], 3, 6>; +defm : SKLWriteResPair<WritePHAddY, [SKLPort5,SKLPort015], 3, [2,1], 3, 7>; + +// Remaining instrs. + +def SKLWriteResGroup1 : SchedWriteRes<[SKLPort0]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDS(B|W)irr", + "MMX_PADDUS(B|W)irr", + "MMX_PAVG(B|W)irr", + "MMX_PCMPEQ(B|D|W)irr", + "MMX_PCMPGT(B|D|W)irr", + "MMX_P(MAX|MIN)SWirr", + "MMX_P(MAX|MIN)UBirr", + "MMX_PSUBS(B|W)irr", + "MMX_PSUBUS(B|W)irr")>; + +def SKLWriteResGroup3 : SchedWriteRes<[SKLPort5]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup3], (instregex "COM(P?)_FST0r", + "UCOM_F(P?)r")>; + +def SKLWriteResGroup4 : SchedWriteRes<[SKLPort6]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup4], (instregex "JMP(16|32|64)r")>; + +def SKLWriteResGroup6 : SchedWriteRes<[SKLPort05]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup6], (instrs FINCSTP, FNOP)>; + +def SKLWriteResGroup7 : SchedWriteRes<[SKLPort06]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>; + +def SKLWriteResGroup8 : SchedWriteRes<[SKLPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup8], (instregex "ANDN(32|64)rr", + "BLSI(32|64)rr", + "BLSMSK(32|64)rr", + "BLSR(32|64)rr")>; + +def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup9], (instregex "(V?)PADD(B|D|Q|W)(Y?)rr", + "VPBLENDD(Y?)rri", + "(V?)PSUB(B|D|Q|W)(Y?)rr")>; + +def SKLWriteResGroup10 : SchedWriteRes<[SKLPort0156]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup10], (instrs CBW, CWDE, CDQE, + CMC, STC)>; +def: InstRW<[SKLWriteResGroup10], (instregex "SGDT64m", + "SIDT64m", + "SMSW16m", + "STRm", + "SYSCALL")>; + +def SKLWriteResGroup11 : SchedWriteRes<[SKLPort4,SKLPort237]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup11], (instregex "FBSTPm", + "ST_FP(32|64|80)m", + "VMPTRSTm")>; + +def SKLWriteResGroup13 : SchedWriteRes<[SKLPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: 
InstRW<[SKLWriteResGroup13], (instregex "MMX_MOVQ2DQrr")>; + +def SKLWriteResGroup14 : SchedWriteRes<[SKLPort05]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup14], (instrs FDECSTP)>; +def: InstRW<[SKLWriteResGroup14], (instregex "MMX_MOVDQ2Qrr")>; + +def SKLWriteResGroup15 : SchedWriteRes<[SKLPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup15], (instregex "ROL(8|16|32|64)r1", + "ROL(8|16|32|64)ri", + "ROR(8|16|32|64)r1", + "ROR(8|16|32|64)ri", + "SET(A|BE)r")>; + +def SKLWriteResGroup17 : SchedWriteRes<[SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup17], (instrs LFENCE, + WAIT, + XGETBV)>; + +def SKLWriteResGroup20 : SchedWriteRes<[SKLPort6,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup20], (instregex "CLFLUSH")>; + +def SKLWriteResGroup21 : SchedWriteRes<[SKLPort237,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup21], (instrs SFENCE)>; + +def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup23], (instrs CWD)>; +def: InstRW<[SKLWriteResGroup23], (instrs JCXZ, JECXZ, JRCXZ)>; +def: InstRW<[SKLWriteResGroup23], (instregex "ADC8i8", + "ADC8ri", + "SBB8i8", + "SBB8ri")>; + +def SKLWriteResGroup25 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup25], (instrs FNSTCW16m)>; + +def SKLWriteResGroup27 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort15]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup27], (instregex "MOVBE(16|32|64)mr")>; + +def SKLWriteResGroup28 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup28], (instrs PUSH16r, PUSH32r, PUSH64r, + STOSB, STOSL, STOSQ, STOSW)>; +def: InstRW<[SKLWriteResGroup28], (instregex "PUSH(16|32|64)rmr", + "PUSH64i8")>; + +def SKLWriteResGroup29 : SchedWriteRes<[SKLPort1]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup29], (instregex "PDEP(32|64)rr", + "PEXT(32|64)rr")>; + +def SKLWriteResGroup29_16i : SchedWriteRes<[SKLPort1, SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup29_16i], (instrs IMUL16rri, IMUL16rri8)>; + +def SKLWriteResGroup30 : SchedWriteRes<[SKLPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup30], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)", + "VPBROADCASTBrr", + "VPBROADCASTWrr", + "(V?)PCMPGTQ(Y?)rr")>; + +def SKLWriteResGroup32 : SchedWriteRes<[SKLPort0,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup32], (instrs FNSTSW16r)>; + +def SKLWriteResGroup33 : SchedWriteRes<[SKLPort06]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKLWriteResGroup33], (instregex "ROL(8|16|32|64)rCL", + "ROR(8|16|32|64)rCL", + "SAR(8|16|32|64)rCL", + "SHL(8|16|32|64)rCL", + "SHR(8|16|32|64)rCL")>; + +def SKLWriteResGroup34 
: SchedWriteRes<[SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKLWriteResGroup34], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr, + XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr, + XCHG16ar, XCHG32ar, XCHG64ar)>; + +def SKLWriteResGroup35 : SchedWriteRes<[SKLPort0,SKLPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup35], (instregex "MMX_PH(ADD|SUB)SWrr")>; + +def SKLWriteResGroup36 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup36], (instregex "(V?)PHADDSW(Y?)rr", + "(V?)PHSUBSW(Y?)rr")>; + +def SKLWriteResGroup39 : SchedWriteRes<[SKLPort5,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKSSDWirr", + "MMX_PACKSSWBirr", + "MMX_PACKUSWBirr")>; + +def SKLWriteResGroup40 : SchedWriteRes<[SKLPort6,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup40], (instregex "CLD")>; + +def SKLWriteResGroup41 : SchedWriteRes<[SKLPort237,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup41], (instrs MFENCE)>; + +def SKLWriteResGroup42 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup42], (instregex "RCL(8|16|32|64)r1", + "RCL(8|16|32|64)ri", + "RCR(8|16|32|64)r1", + "RCR(8|16|32|64)ri")>; + +def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup43], (instrs FNSTSWm)>; + +def SKLWriteResGroup44 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKLWriteResGroup44], (instregex "SET(A|BE)m")>; + +def SKLWriteResGroup45 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup45], (instregex "CALL(16|32|64)r")>; + +def SKLWriteResGroup46 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup46], (instrs CALL64pcrel32)>; + +def SKLWriteResGroup47 : SchedWriteRes<[SKLPort0]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup47], (instregex "MUL_(FPrST0|FST0r|FrST0)")>; + +def SKLWriteResGroup48 : SchedWriteRes<[SKLPort01]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup48], (instregex "(V?)CVTDQ2PS(Y?)rr", + "(V?)CVT(T?)PS2DQ(Y?)rr")>; + +def SKLWriteResGroup51 : SchedWriteRes<[SKLPort1,SKLPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup51], (instrs IMUL64r, MUL64r, MULX64rr)>; + +def SKLWriteResGroup51_16 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKLWriteResGroup51_16], (instrs IMUL16r, MUL16r)>; + +def SKLWriteResGroup53 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: 
InstRW<[SKLWriteResGroup53], (instregex "IST(T?)_FP(16|32|64)m", + "IST_F(16|32)m")>; + +def SKLWriteResGroup54 : SchedWriteRes<[SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} +def: InstRW<[SKLWriteResGroup54], (instrs FNCLEX)>; + +def SKLWriteResGroup55 : SchedWriteRes<[SKLPort6,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKLWriteResGroup55], (instrs PAUSE)>; + +def SKLWriteResGroup56 : SchedWriteRes<[SKLPort015,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKLWriteResGroup56], (instrs VZEROUPPER)>; + +def SKLWriteResGroup57 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKLWriteResGroup57], (instregex "LAR(16|32|64)rr")>; + +def SKLWriteResGroup58 : SchedWriteRes<[SKLPort23]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm16", + "MOVSX(16|32|64)rm32", + "MOVSX(16|32|64)rm8", + "MOVZX(16|32|64)rm16", + "MOVZX(16|32|64)rm8", + "(V?)MOVDDUPrm")>; // TODO: Should this be SKLWriteResGroup67? + +def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup59], (instregex "MMX_CVTPI2PDirr", + "(V?)CVTDQ2PDrr")>; + +def SKLWriteResGroup60 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVT(T?)PD2PIirr", + "MMX_CVT(T?)PS2PIirr", + "(V?)CVT(T?)PD2DQrr", + "(V?)CVTPD2PSrr", + "(V?)CVTPS2PDrr", + "(V?)CVTSD2SSrr", + "(V?)CVTSI642SDrr", + "(V?)CVTSI2SDrr", + "(V?)CVTSI2SSrr", + "(V?)CVTSS2SDrr")>; + +def SKLWriteResGroup61 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup61], (instregex "STR(16|32|64)r")>; + +def SKLWriteResGroup62 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup62], (instrs IMUL32r, MUL32r, MULX32rr)>; + +def SKLWriteResGroup63 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: InstRW<[SKLWriteResGroup63], (instrs XSETBV)>; + +def SKLWriteResGroup64 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; +} +def: InstRW<[SKLWriteResGroup64], (instregex "CMPXCHG(8|16|32|64)rr")>; + +def SKLWriteResGroup65 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 6; + let ResourceCycles = [1,1,4]; +} +def: InstRW<[SKLWriteResGroup65], (instregex "PUSHF(16|64)")>; + +def SKLWriteResGroup67 : SchedWriteRes<[SKLPort23]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup67], (instregex "VBROADCASTSSrm", + "(V?)MOVSHDUPrm", + "(V?)MOVSLDUPrm", + "VPBROADCASTDrm", + "VPBROADCASTQrm")>; + +def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup68], (instregex "MMX_CVTPI2PSirr")>; + +def SKLWriteResGroup69 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let 
ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDSBirm", + "MMX_PADDSWirm", + "MMX_PADDUSBirm", + "MMX_PADDUSWirm", + "MMX_PAVGBirm", + "MMX_PAVGWirm", + "MMX_PCMPEQBirm", + "MMX_PCMPEQDirm", + "MMX_PCMPEQWirm", + "MMX_PCMPGTBirm", + "MMX_PCMPGTDirm", + "MMX_PCMPGTWirm", + "MMX_PMAXSWirm", + "MMX_PMAXUBirm", + "MMX_PMINSWirm", + "MMX_PMINUBirm", + "MMX_PSUBSBirm", + "MMX_PSUBSWirm", + "MMX_PSUBUSBirm", + "MMX_PSUBUSWirm")>; + +def SKLWriteResGroup70 : SchedWriteRes<[SKLPort0,SKLPort01]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup70], (instregex "(V?)CVTSS2SI(64)?rr", + "(V?)CVT(T?)SD2SI(64)?rr")>; + +def SKLWriteResGroup72 : SchedWriteRes<[SKLPort6,SKLPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup72], (instregex "FARJMP64", + "JMP(16|32|64)m")>; + +def SKLWriteResGroup74 : SchedWriteRes<[SKLPort23,SKLPort06]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup74], (instregex "BT(16|32|64)mi8")>; + +def SKLWriteResGroup75 : SchedWriteRes<[SKLPort23,SKLPort15]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup75], (instregex "ANDN(32|64)rm", + "BLSI(32|64)rm", + "BLSMSK(32|64)rm", + "BLSR(32|64)rm", + "MOVBE(16|32|64)rm")>; + +def SKLWriteResGroup76 : SchedWriteRes<[SKLPort23,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup76], (instrs POP16r, POP32r, POP64r)>; +def: InstRW<[SKLWriteResGroup76], (instregex "POP(16|32|64)rmr")>; + +def SKLWriteResGroup78 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup78], (instregex "(V?)CVTSI642SSrr")>; + +def SKLWriteResGroup80 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup80], (instregex "SLDT(16|32|64)r")>; + +def SKLWriteResGroup82 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup82], (instregex "BTC(16|32|64)mi8", + "BTR(16|32|64)mi8", + "BTS(16|32|64)mi8", + "SAR(8|16|32|64)m1", + "SAR(8|16|32|64)mi", + "SHL(8|16|32|64)m1", + "SHL(8|16|32|64)mi", + "SHR(8|16|32|64)m1", + "SHR(8|16|32|64)mi")>; + +def SKLWriteResGroup83 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup83], (instregex "POP(16|32|64)rmm", + "PUSH(16|32|64)rmm")>; + +def SKLWriteResGroup84 : SchedWriteRes<[SKLPort6,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 6; + let ResourceCycles = [1,5]; +} +def: InstRW<[SKLWriteResGroup84], (instrs STD)>; + +def SKLWriteResGroup85 : SchedWriteRes<[SKLPort23]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup85], (instregex "LD_F(32|64|80)m", + "VBROADCASTF128", + "VBROADCASTI128", + "VBROADCASTSDYrm", + "VBROADCASTSSYrm", + "VMOVDDUPYrm", + "VMOVSHDUPYrm", + "VMOVSLDUPYrm", + "VPBROADCASTDYrm", + "VPBROADCASTQYrm")>; + +def SKLWriteResGroup86 : SchedWriteRes<[SKLPort0,SKLPort5]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: 
InstRW<[SKLWriteResGroup86], (instregex "VCVTDQ2PDYrr")>; + +def SKLWriteResGroup88 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup88], (instregex "(V?)PMOV(SX|ZX)BDrm", + "(V?)PMOV(SX|ZX)BQrm", + "(V?)PMOV(SX|ZX)BWrm", + "(V?)PMOV(SX|ZX)DQrm", + "(V?)PMOV(SX|ZX)WDrm", + "(V?)PMOV(SX|ZX)WQrm")>; + +def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPD2PSYrr", + "VCVTPS2PDYrr", + "VCVT(T?)PD2DQYrr")>; + +def SKLWriteResGroup91 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup91], (instregex "(V?)INSERTF128rm", + "(V?)INSERTI128rm", + "(V?)PADD(B|D|Q|W)rm", + "(V?)PBLENDDrmi", + "(V?)PSUB(B|D|Q|W)rm")>; + +def SKLWriteResGroup92 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKSSDWirm", + "MMX_PACKSSWBirm", + "MMX_PACKUSWBirm")>; + +def SKLWriteResGroup94 : SchedWriteRes<[SKLPort23,SKLPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup94], (instrs LEAVE, LEAVE64, + SCASB, SCASL, SCASQ, SCASW)>; + +def SKLWriteResGroup95 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort01]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup95], (instregex "(V?)CVTTSS2SI(64)?rr")>; + +def SKLWriteResGroup96 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup96], (instrs FLDCW16m)>; + +def SKLWriteResGroup98 : SchedWriteRes<[SKLPort6,SKLPort23,SKLPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup98], (instrs LRETQ, RETQ)>; + +def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SKLWriteResGroup100], (instregex "ROL(8|16|32|64)m1", + "ROL(8|16|32|64)mi", + "ROR(8|16|32|64)m1", + "ROR(8|16|32|64)mi")>; + +def SKLWriteResGroup101 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SKLWriteResGroup101], (instregex "XADD(8|16|32|64)rm")>; + +def SKLWriteResGroup102 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup102], (instregex "CALL(16|32|64)m", + "FARCALL64")>; + +def SKLWriteResGroup103 : SchedWriteRes<[SKLPort6,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 7; + let NumMicroOps = 7; + let ResourceCycles = [1,3,1,2]; +} +def: InstRW<[SKLWriteResGroup103], (instrs LOOP)>; + +def SKLWriteResGroup107 : SchedWriteRes<[SKLPort1,SKLPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup107], (instregex "PDEP(32|64)rm", + "PEXT(32|64)rm")>; + +def SKLWriteResGroup107_16 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup107_16], (instrs IMUL16rmi, 
IMUL16rmi8)>; + +def SKLWriteResGroup107_16_2 : SchedWriteRes<[SKLPort1, SKLPort06, SKLPort0156, SKLPort23]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,1,2,1]; +} +def: InstRW<[SKLWriteResGroup107_16_2], (instrs IMUL16m, MUL16m)>; + +def SKLWriteResGroup108 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup108], (instregex "FCOM(P?)(32|64)m", + "VPBROADCASTBYrm", + "VPBROADCASTWYrm", + "VPMOVSXBDYrm", + "VPMOVSXBQYrm", + "VPMOVSXWQYrm")>; + +def SKLWriteResGroup110 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup110], (instregex "VPADD(B|D|Q|W)Yrm", + "VPBLENDDYrmi", + "VPSUB(B|D|Q|W)Yrm")>; + +def SKLWriteResGroup112 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PH(ADD|SUB)SWrm")>; + +def SKLWriteResGroup115 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKLWriteResGroup115], (instregex "ROR(8|16|32|64)mCL")>; + +def SKLWriteResGroup116 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SKLWriteResGroup116], (instregex "RCL(8|16|32|64)m1", + "RCL(8|16|32|64)mi", + "RCR(8|16|32|64)m1", + "RCR(8|16|32|64)mi")>; + +def SKLWriteResGroup117 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> { + let Latency = 8; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,3]; +} +def: InstRW<[SKLWriteResGroup117], (instregex "ROL(8|16|32|64)mCL", + "SAR(8|16|32|64)mCL", + "SHL(8|16|32|64)mCL", + "SHR(8|16|32|64)mCL")>; + +def SKLWriteResGroup119 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 8; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,2,1]; +} +def: SchedAlias<WriteADCRMW, SKLWriteResGroup119>; +def: InstRW<[SKLWriteResGroup119], (instregex "CMPXCHG(8|16|32|64)rm")>; + +def SKLWriteResGroup120 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup120], (instregex "MMX_CVTPI2PSirm")>; + +def SKLWriteResGroup121 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup121], (instregex "(V?)PCMPGTQrm", + "VPMOVSXBWYrm", + "VPMOVSXDQYrm", + "VPMOVSXWDYrm", + "VPMOVZXWDYrm")>; + +def SKLWriteResGroup123 : SchedWriteRes<[SKLPort23,SKLPort01]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVT(T?)PS2PIirm", + "(V?)CVTPS2PDrm")>; + +def SKLWriteResGroup127 : SchedWriteRes<[SKLPort1,SKLPort5,SKLPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup127], (instrs IMUL64m, MUL64m, MULX64rm)>; + +def SKLWriteResGroup128 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup128], (instregex "(V?)PHADDSWrm", + "(V?)PHSUBSWrm")>; + +def SKLWriteResGroup131 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} 
+def: InstRW<[SKLWriteResGroup131], (instregex "LAR(16|32|64)rm", + "LSL(16|32|64)rm")>; + +def SKLWriteResGroup133 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup133], (instregex "(ADD|SUB|SUBR)_F(32|64)m", + "ILD_F(16|32|64)m", + "VPCMPGTQYrm")>; + +def SKLWriteResGroup134 : SchedWriteRes<[SKLPort01,SKLPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup134], (instregex "(V?)CVTDQ2PSrm", + "(V?)CVTPS2DQrm", + "(V?)CVTSS2SDrm", + "(V?)CVTTPS2DQrm")>; + +def SKLWriteResGroup138 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup138], (instregex "MMX_CVTPI2PDirm")>; + +def SKLWriteResGroup139 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup139], (instregex "(V?)CVTSD2SSrm")>; + +def SKLWriteResGroup140 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup140], (instregex "VPHADDSWYrm", + "VPHSUBSWYrm")>; + +def SKLWriteResGroup142 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort06,SKLPort0156]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup142], (instrs IMUL32m, MUL32m, MULX32rm)>; + +def SKLWriteResGroup143 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 10; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,3]; +} +def: InstRW<[SKLWriteResGroup143], (instregex "XCHG(8|16|32|64)rm")>; + +def SKLWriteResGroup145 : SchedWriteRes<[SKLPort0,SKLFPDivider]> { + let Latency = 11; + let NumMicroOps = 1; + let ResourceCycles = [1,3]; +} +def : SchedAlias<WriteFDivX, SKLWriteResGroup145>; // TODO - convert to ZnWriteResFpuPair + +def SKLWriteResGroup146 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup146], (instregex "MUL_F(32|64)m")>; + +def SKLWriteResGroup147 : SchedWriteRes<[SKLPort01,SKLPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup147], (instregex "VCVTDQ2PSYrm", + "VCVTPS2PDYrm", + "VCVT(T?)PS2DQYrm")>; + +def SKLWriteResGroup149 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup149], (instregex "FICOM(P?)(16|32)m")>; + +def SKLWriteResGroup150 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup150], (instregex "(V?)CVTDQ2PDrm")>; + +def SKLWriteResGroup151 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort01]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup151], (instregex "(V?)CVTSS2SI64rm", + "(V?)CVT(T?)SD2SI(64)?rm", + "VCVTTSS2SI64rm", + "(V?)CVT(T?)SS2SIrm")>; + +def SKLWriteResGroup152 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort01]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2PSrm", + "CVT(T?)PD2DQrm", + "MMX_CVT(T?)PD2PIirm")>; + +def SKLWriteResGroup154 : 
SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 11; + let NumMicroOps = 7; + let ResourceCycles = [2,3,2]; +} +def: InstRW<[SKLWriteResGroup154], (instregex "RCL(16|32|64)rCL", + "RCR(16|32|64)rCL")>; + +def SKLWriteResGroup155 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 11; + let NumMicroOps = 9; + let ResourceCycles = [1,5,1,2]; +} +def: InstRW<[SKLWriteResGroup155], (instregex "RCL8rCL")>; + +def SKLWriteResGroup156 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 11; + let NumMicroOps = 11; + let ResourceCycles = [2,9]; +} +def: InstRW<[SKLWriteResGroup156], (instrs LOOPE, LOOPNE)>; + +def SKLWriteResGroup160 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort01]> { + let Latency = 12; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup160], (instregex "CVTTSS2SI64rm")>; + +def SKLWriteResGroup162 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup162], (instregex "(ADD|SUB|SUBR)_FI(16|32)m")>; + +def SKLWriteResGroup163 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup163], (instregex "VCVTDQ2PDYrm")>; + +def SKLWriteResGroup166 : SchedWriteRes<[SKLPort0,SKLFPDivider]> { + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1,3]; +} +def : SchedAlias<WriteFDiv64, SKLWriteResGroup166>; // TODO - convert to ZnWriteResFpuPair +def : SchedAlias<WriteFDiv64X, SKLWriteResGroup166>; // TODO - convert to ZnWriteResFpuPair + +def SKLWriteResGroup166_1 : SchedWriteRes<[SKLPort0,SKLFPDivider]> { + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1,5]; +} +def : SchedAlias<WriteFDiv64Y, SKLWriteResGroup166_1>; // TODO - convert to ZnWriteResFpuPair + +def SKLWriteResGroup169 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup169], (instregex "MUL_FI(16|32)m")>; + +def SKLWriteResGroup170 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [2,4,1,3]; +} +def: InstRW<[SKLWriteResGroup170], (instregex "RCR8rCL")>; + +def SKLWriteResGroup171 : SchedWriteRes<[SKLPort0]> { + let Latency = 15; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>; + +def SKLWriteResGroup174 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 15; + let NumMicroOps = 10; + let ResourceCycles = [1,1,1,5,1,1]; +} +def: InstRW<[SKLWriteResGroup174], (instregex "RCL(8|16|32|64)mCL")>; + +def SKLWriteResGroup177 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 16; + let NumMicroOps = 14; + let ResourceCycles = [1,1,1,4,2,5]; +} +def: InstRW<[SKLWriteResGroup177], (instrs CMPXCHG8B)>; + +def SKLWriteResGroup178 : SchedWriteRes<[SKLPort0156]> { + let Latency = 16; + let NumMicroOps = 16; + let ResourceCycles = [16]; +} +def: InstRW<[SKLWriteResGroup178], (instrs VZEROALL)>; + +def SKLWriteResGroup179 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> { + let Latency = 17; + let NumMicroOps = 2; + let ResourceCycles = [1,1,5]; +} +def : SchedAlias<WriteFDivXLd, SKLWriteResGroup179>; // TODO - convert to ZnWriteResFpuPair + +def SKLWriteResGroup180 : 
SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> { + let Latency = 17; + let NumMicroOps = 15; + let ResourceCycles = [2,1,2,4,2,4]; +} +def: InstRW<[SKLWriteResGroup180], (instrs XCH_F)>; + +def SKLWriteResGroup184 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort06,SKLPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,5]; +} +def: InstRW<[SKLWriteResGroup184], (instrs CPUID, RDTSC)>; + +def SKLWriteResGroup185 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 18; + let NumMicroOps = 11; + let ResourceCycles = [2,1,1,4,1,2]; +} +def: InstRW<[SKLWriteResGroup185], (instregex "RCR(8|16|32|64)mCL")>; + +def SKLWriteResGroup186 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> { + let Latency = 19; + let NumMicroOps = 2; + let ResourceCycles = [1,1,4]; +} +def : SchedAlias<WriteFDiv64Ld, SKLWriteResGroup186>; // TODO - convert to ZnWriteResFpuPair + +def SKLWriteResGroup189 : SchedWriteRes<[SKLPort0]> { + let Latency = 20; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup189], (instregex "DIV_(FPrST0|FST0r|FrST0)")>; + +def SKLWriteResGroup190 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> { + let Latency = 20; + let NumMicroOps = 2; + let ResourceCycles = [1,1,4]; +} +def : SchedAlias<WriteFDiv64XLd, SKLWriteResGroup190>; // TODO - convert to ZnWriteResFpuPair + +def SKLWriteResGroup192 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 20; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,1,2]; +} +def: InstRW<[SKLWriteResGroup192], (instrs INSB, INSL, INSW)>; + +def SKLWriteResGroup193 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort0156]> { + let Latency = 20; + let NumMicroOps = 10; + let ResourceCycles = [1,2,7]; +} +def: InstRW<[SKLWriteResGroup193], (instrs MWAITrr)>; + +def SKLWriteResGroup195 : SchedWriteRes<[SKLPort0,SKLPort23,SKLFPDivider]> { + let Latency = 21; + let NumMicroOps = 2; + let ResourceCycles = [1,1,8]; +} +def : SchedAlias<WriteFDiv64YLd, SKLWriteResGroup195>; // TODO - convert to ZnWriteResFpuPair + +def SKLWriteResGroup196 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 22; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F(32|64)m")>; + +def SKLWriteResGroup196_1 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> { + let Latency = 22; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKLWriteResGroup196_1], (instrs VGATHERDPSrm, + VGATHERDPDrm, + VGATHERQPDrm, + VGATHERQPSrm, + VPGATHERDDrm, + VPGATHERDQrm, + VPGATHERQDrm, + VPGATHERQQrm)>; + +def SKLWriteResGroup196_2 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> { + let Latency = 25; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKLWriteResGroup196_2], (instrs VGATHERDPSYrm, + VGATHERQPDYrm, + VGATHERQPSYrm, + VPGATHERDDYrm, + VPGATHERDQYrm, + VPGATHERQDYrm, + VPGATHERQQYrm, + VGATHERDPDYrm)>; + +def SKLWriteResGroup198 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort5,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 23; + let NumMicroOps = 19; + let ResourceCycles = [2,1,4,1,1,4,6]; +} +def: InstRW<[SKLWriteResGroup198], (instrs CMPXCHG16B)>; + +def SKLWriteResGroup202 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 25; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup202], (instregex "DIV_FI(16|32)m")>; + +def 
SKLWriteResGroup206 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 27; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup206], (instregex "DIVR_F(32|64)m")>; + +def SKLWriteResGroup207 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort0156]> { + let Latency = 28; + let NumMicroOps = 8; + let ResourceCycles = [2,4,1,1]; +} +def: InstRW<[SKLWriteResGroup207], (instregex "IDIV(8|16|32|64)m")>; + +def SKLWriteResGroup208 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 30; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup208], (instregex "DIVR_FI(16|32)m")>; + +def SKLWriteResGroup209 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort06,SKLPort0156]> { + let Latency = 35; + let NumMicroOps = 23; + let ResourceCycles = [1,5,3,4,10]; +} +def: InstRW<[SKLWriteResGroup209], (instregex "IN(8|16|32)ri", + "IN(8|16|32)rr")>; + +def SKLWriteResGroup210 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 35; + let NumMicroOps = 23; + let ResourceCycles = [1,5,2,1,4,10]; +} +def: InstRW<[SKLWriteResGroup210], (instregex "OUT(8|16|32)ir", + "OUT(8|16|32)rr")>; + +def SKLWriteResGroup211 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> { + let Latency = 37; + let NumMicroOps = 31; + let ResourceCycles = [1,8,1,21]; +} +def: InstRW<[SKLWriteResGroup211], (instregex "XRSTOR(64)?")>; + +def SKLWriteResGroup212 : SchedWriteRes<[SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort15,SKLPort0156]> { + let Latency = 40; + let NumMicroOps = 18; + let ResourceCycles = [1,1,2,3,1,1,1,8]; +} +def: InstRW<[SKLWriteResGroup212], (instrs VMCLEARm)>; + +def SKLWriteResGroup213 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 41; + let NumMicroOps = 39; + let ResourceCycles = [1,10,1,1,26]; +} +def: InstRW<[SKLWriteResGroup213], (instrs XSAVE64)>; + +def SKLWriteResGroup214 : SchedWriteRes<[SKLPort5,SKLPort0156]> { + let Latency = 42; + let NumMicroOps = 22; + let ResourceCycles = [2,20]; +} +def: InstRW<[SKLWriteResGroup214], (instrs RDTSCP)>; + +def SKLWriteResGroup215 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 42; + let NumMicroOps = 40; + let ResourceCycles = [1,11,1,1,26]; +} +def: InstRW<[SKLWriteResGroup215], (instrs XSAVE)>; +def: InstRW<[SKLWriteResGroup215], (instregex "XSAVEC", "XSAVES")>; + +def SKLWriteResGroup216 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 46; + let NumMicroOps = 44; + let ResourceCycles = [1,11,1,1,30]; +} +def: InstRW<[SKLWriteResGroup216], (instregex "XSAVEOPT")>; + +def SKLWriteResGroup217 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05,SKLPort06,SKLPort0156]> { + let Latency = 62; + let NumMicroOps = 64; + let ResourceCycles = [2,8,5,10,39]; +} +def: InstRW<[SKLWriteResGroup217], (instrs FLDENVm)>; + +def SKLWriteResGroup218 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 63; + let NumMicroOps = 88; + let ResourceCycles = [4,4,31,1,2,1,45]; +} +def: InstRW<[SKLWriteResGroup218], (instrs FXRSTOR64)>; + +def SKLWriteResGroup219 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 63; + let NumMicroOps = 90; + let ResourceCycles = [4,2,33,1,2,1,47]; +} +def: InstRW<[SKLWriteResGroup219], (instrs FXRSTOR)>; + +def SKLWriteResGroup220 : SchedWriteRes<[SKLPort5,SKLPort05,SKLPort0156]> { + 
let Latency = 75; + let NumMicroOps = 15; + let ResourceCycles = [6,3,6]; +} +def: InstRW<[SKLWriteResGroup220], (instrs FNINIT)>; + +def SKLWriteResGroup221 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> { + let Latency = 76; + let NumMicroOps = 32; + let ResourceCycles = [7,2,8,3,1,11]; +} +def: InstRW<[SKLWriteResGroup221], (instregex "DIV(16|32|64)r")>; + +def SKLWriteResGroup222 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort06,SKLPort0156]> { + let Latency = 102; + let NumMicroOps = 66; + let ResourceCycles = [4,2,4,8,14,34]; +} +def: InstRW<[SKLWriteResGroup222], (instregex "IDIV(16|32|64)r")>; + +def SKLWriteResGroup223 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 106; + let NumMicroOps = 100; + let ResourceCycles = [9,1,11,16,1,11,21,30]; +} +def: InstRW<[SKLWriteResGroup223], (instrs FSTENVm)>; + +def: InstRW<[WriteZero], (instrs CLC)>; + +} // SchedModel diff --git a/capstone/suite/synctools/tablegen/X86/back/X86SchedSkylakeServer.td b/capstone/suite/synctools/tablegen/X86/back/X86SchedSkylakeServer.td new file mode 100755 index 000000000..9d5f8555c --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86SchedSkylakeServer.td @@ -0,0 +1,2580 @@ +//=- X86SchedSkylake.td - X86 Skylake Server Scheduling ------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Skylake Server to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def SkylakeServerModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SKylake can + // decode 6 instructions per cycle. + let IssueWidth = 6; + let MicroOpBufferSize = 224; // Based on the reorder buffer. + let LoadLatency = 5; + let MispredictPenalty = 14; + + // Based on the LSD (loop-stream detector) queue size and benchmarking data. + let LoopMicroOpBufferSize = 50; + + // This flag is set to allow the scheduler to assign a default model to + // unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = SkylakeServerModel in { + +// Skylake Server can issue micro-ops to 8 different ports in one cycle. + +// Ports 0, 1, 5, and 6 handle all computation. +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. Port 7 can handle address calculations. +def SKXPort0 : ProcResource<1>; +def SKXPort1 : ProcResource<1>; +def SKXPort2 : ProcResource<1>; +def SKXPort3 : ProcResource<1>; +def SKXPort4 : ProcResource<1>; +def SKXPort5 : ProcResource<1>; +def SKXPort6 : ProcResource<1>; +def SKXPort7 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. 
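+// Each ProcResGroup below names the set of ports such a micro-op may use;
+// e.g. a WriteRes that lists SKXPort06 occupies one cycle on either port 0
+// or port 6, whichever is free.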
+def SKXPort01 : ProcResGroup<[SKXPort0, SKXPort1]>; +def SKXPort23 : ProcResGroup<[SKXPort2, SKXPort3]>; +def SKXPort237 : ProcResGroup<[SKXPort2, SKXPort3, SKXPort7]>; +def SKXPort04 : ProcResGroup<[SKXPort0, SKXPort4]>; +def SKXPort05 : ProcResGroup<[SKXPort0, SKXPort5]>; +def SKXPort06 : ProcResGroup<[SKXPort0, SKXPort6]>; +def SKXPort15 : ProcResGroup<[SKXPort1, SKXPort5]>; +def SKXPort16 : ProcResGroup<[SKXPort1, SKXPort6]>; +def SKXPort56 : ProcResGroup<[SKXPort5, SKXPort6]>; +def SKXPort015 : ProcResGroup<[SKXPort0, SKXPort1, SKXPort5]>; +def SKXPort056 : ProcResGroup<[SKXPort0, SKXPort5, SKXPort6]>; +def SKXPort0156: ProcResGroup<[SKXPort0, SKXPort1, SKXPort5, SKXPort6]>; + +def SKXDivider : ProcResource<1>; // Integer division issued on port 0. +// FP division and sqrt on port 0. +def SKXFPDivider : ProcResource<1>; + +// 60 Entry Unified Scheduler +def SKXPortAny : ProcResGroup<[SKXPort0, SKXPort1, SKXPort2, SKXPort3, SKXPort4, + SKXPort5, SKXPort6, SKXPort7]> { + let BufferSize=60; +} + +// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 5>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SKXWriteResPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1, + int LoadLat = 5> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses a cycle on port 2/3 and adds LoadLat cycles to + // the latency (default = 5). + def : WriteRes<SchedRW.Folded, !listconcat([SKXPort23], ExePorts)> { + let Latency = !add(Lat, LoadLat); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = !add(UOps, 1); + } +} + +// A folded store needs a cycle on port 4 for the store data, and an extra port +// 2/3/7 cycle to recompute the address. +def : WriteRes<WriteRMW, [SKXPort237,SKXPort4]>; + +// Arithmetic. +defm : SKXWriteResPair<WriteALU, [SKXPort0156], 1>; // Simple integer ALU op. +defm : SKXWriteResPair<WriteADC, [SKXPort06], 1>; // Integer ALU + flags op. +defm : SKXWriteResPair<WriteIMul, [SKXPort1], 3>; // Integer multiplication. +defm : SKXWriteResPair<WriteIMul64, [SKXPort1], 3>; // Integer 64-bit multiplication. + +defm : X86WriteRes<WriteBSWAP32, [SKXPort15], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [SKXPort06, SKXPort15], 2, [1,1], 2>; + +defm : SKXWriteResPair<WriteDiv8, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>; +defm : SKXWriteResPair<WriteDiv16, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>; +defm : SKXWriteResPair<WriteDiv32, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>; +defm : SKXWriteResPair<WriteDiv64, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>; +defm : SKXWriteResPair<WriteIDiv8, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>; +defm : SKXWriteResPair<WriteIDiv16, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>; +defm : SKXWriteResPair<WriteIDiv32, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>; +defm : SKXWriteResPair<WriteIDiv64, [SKXPort0, SKXDivider], 25, [1,10], 1, 4>; + +defm : SKXWriteResPair<WriteCRC32, [SKXPort1], 3>; + +def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part. 
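+
+// As an illustration of how the SKXWriteResPair multiclass above is used (a
+// sketch only, assuming the folded-load variant of WriteALU is named
+// WriteALULd as in X86Schedule.td), the instantiation
+//   defm : SKXWriteResPair<WriteALU, [SKXPort0156], 1>;
+// expands to roughly:
+//   def : WriteRes<WriteALU, [SKXPort0156]> {
+//     let Latency = 1;            // Lat
+//     let ResourceCycles = [1];   // Res, defaults to [1]
+//     let NumMicroOps = 1;        // UOps, defaults to 1
+//   }
+//   def : WriteRes<WriteALULd, [SKXPort23, SKXPort0156]> {
+//     let Latency = 6;            // Lat + default LoadLat of 5
+//     let ResourceCycles = [1,1]; // one extra cycle on the port 2/3 load ports
+//     let NumMicroOps = 2;        // the micro-fused load adds one micro-op
+//   }
+// Each ResourceCycles entry pairs positionally with the corresponding entry in
+// the WriteRes port list.
+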
+def : WriteRes<WriteLEA, [SKXPort15]>; // LEA instructions can't fold loads. + +defm : SKXWriteResPair<WriteCMOV, [SKXPort06], 1, [1], 1>; // Conditional move. +defm : SKXWriteResPair<WriteCMOV2, [SKXPort06], 2, [2], 2>; // Conditional (CF + ZF flag) move. +defm : X86WriteRes<WriteFCMOV, [SKXPort1], 3, [1], 1>; // x87 conditional move. +def : WriteRes<WriteSETCC, [SKXPort06]>; // Setcc. +def : WriteRes<WriteSETCCStore, [SKXPort06,SKXPort4,SKXPort237]> { + let Latency = 2; + let NumMicroOps = 3; +} +def : WriteRes<WriteLAHFSAHF, [SKXPort06]>; +def : WriteRes<WriteBitTest,[SKXPort06]>; // + +// Integer shifts and rotates. +defm : SKXWriteResPair<WriteShift, [SKXPort06], 1>; + +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [SKXPort1], 3, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[SKXPort1,SKXPort06,SKXPort0156], 6, [1, 2, 1], 4>; +defm : X86WriteRes<WriteSHDmri, [SKXPort1,SKXPort23,SKXPort237,SKXPort0156], 9, [1, 1, 1, 1], 4>; +defm : X86WriteRes<WriteSHDmrcl,[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort0156], 11, [1, 1, 1, 2, 1], 6>; + +// Bit counts. +defm : SKXWriteResPair<WriteBSF, [SKXPort1], 3>; +defm : SKXWriteResPair<WriteBSR, [SKXPort1], 3>; +defm : SKXWriteResPair<WriteLZCNT, [SKXPort1], 3>; +defm : SKXWriteResPair<WriteTZCNT, [SKXPort1], 3>; +defm : SKXWriteResPair<WritePOPCNT, [SKXPort1], 3>; + +// BMI1 BEXTR, BMI2 BZHI +defm : SKXWriteResPair<WriteBEXTR, [SKXPort06,SKXPort15], 2, [1,1], 2>; +defm : SKXWriteResPair<WriteBZHI, [SKXPort15], 1>; + +// Loads, stores, and moves, not folded with other operations. +defm : X86WriteRes<WriteLoad, [SKXPort23], 5, [1], 1>; +defm : X86WriteRes<WriteStore, [SKXPort237, SKXPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteStoreNT, [SKXPort237, SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteMove, [SKXPort0156], 1, [1], 1>; + +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +def : WriteRes<WriteZero, []>; + +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +defm : SKXWriteResPair<WriteJump, [SKXPort06], 1>; + +// Floating point. This covers both scalar and vector operations. 
+defm : X86WriteRes<WriteFLD0, [SKXPort05], 1, [1], 1>; +defm : X86WriteRes<WriteFLD1, [SKXPort05], 1, [2], 2>; +defm : X86WriteRes<WriteFLDC, [SKXPort05], 1, [2], 2>; +defm : X86WriteRes<WriteFLoad, [SKXPort23], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [SKXPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [SKXPort23], 7, [1], 1>; +defm : X86WriteRes<WriteFMaskedLoad, [SKXPort23,SKXPort015], 7, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedLoadY, [SKXPort23,SKXPort015], 8, [1,1], 2>; +defm : X86WriteRes<WriteFStore, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreNTX, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteEMMS, [SKXPort05,SKXPort0156], 10, [9,1], 10>; + +defm : SKXWriteResPair<WriteFAdd, [SKXPort01], 4, [1], 1, 5>; // Floating point add/sub. +defm : SKXWriteResPair<WriteFAddX, [SKXPort01], 4, [1], 1, 6>; +defm : SKXWriteResPair<WriteFAddY, [SKXPort01], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteFAddZ, [SKXPort05], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteFAdd64, [SKXPort01], 4, [1], 1, 5>; // Floating point double add/sub. +defm : SKXWriteResPair<WriteFAdd64X, [SKXPort01], 4, [1], 1, 6>; +defm : SKXWriteResPair<WriteFAdd64Y, [SKXPort01], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteFAdd64Z, [SKXPort05], 4, [1], 1, 7>; + +defm : SKXWriteResPair<WriteFCmp, [SKXPort01], 4, [1], 1, 5>; // Floating point compare. +defm : SKXWriteResPair<WriteFCmpX, [SKXPort01], 4, [1], 1, 6>; +defm : SKXWriteResPair<WriteFCmpY, [SKXPort01], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteFCmpZ, [SKXPort05], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteFCmp64, [SKXPort01], 4, [1], 1, 5>; // Floating point double compare. +defm : SKXWriteResPair<WriteFCmp64X, [SKXPort01], 4, [1], 1, 6>; +defm : SKXWriteResPair<WriteFCmp64Y, [SKXPort01], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteFCmp64Z, [SKXPort05], 4, [1], 1, 7>; + +defm : SKXWriteResPair<WriteFCom, [SKXPort0], 2>; // Floating point compare to flags. + +defm : SKXWriteResPair<WriteFMul, [SKXPort01], 4, [1], 1, 5>; // Floating point multiplication. +defm : SKXWriteResPair<WriteFMulX, [SKXPort01], 4, [1], 1, 6>; +defm : SKXWriteResPair<WriteFMulY, [SKXPort01], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteFMulZ, [SKXPort05], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteFMul64, [SKXPort01], 4, [1], 1, 5>; // Floating point double multiplication. +defm : SKXWriteResPair<WriteFMul64X, [SKXPort01], 4, [1], 1, 6>; +defm : SKXWriteResPair<WriteFMul64Y, [SKXPort01], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteFMul64Z, [SKXPort05], 4, [1], 1, 7>; + +defm : SKXWriteResPair<WriteFDiv, [SKXPort0,SKXFPDivider], 11, [1,3], 1, 5>; // 10-14 cycles. // Floating point division. +//defm : SKXWriteResPair<WriteFDivX, [SKXPort0,SKXFPDivider], 11, [1,3], 1, 6>; // 10-14 cycles. +defm : SKXWriteResPair<WriteFDivY, [SKXPort0,SKXFPDivider], 11, [1,5], 1, 7>; // 10-14 cycles. 
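+
+// A note on the SKXFPDivider entries above (illustrative, not from the
+// original comments): the divide/sqrt unit is modeled as its own resource, so
+// the ResourceCycles value charged to it is how long it stays reserved, which
+// caps back-to-back divide throughput independently of the 11-cycle latency.
+// For example, the WriteFDivY pair above expands via SKXWriteResPair to
+// roughly:
+//   def : WriteRes<WriteFDivY, [SKXPort0, SKXFPDivider]> {
+//     let Latency = 11;
+//     let ResourceCycles = [1,5]; // 1 cycle on port 0, divider reserved 5 cycles
+//     let NumMicroOps = 1;
+//   }
+// plus a folded-load twin on [SKXPort23, SKXPort0, SKXFPDivider] with
+// Latency = 18 (11 plus the 7-cycle YMM load latency).
+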
+defm : SKXWriteResPair<WriteFDivZ, [SKXPort0,SKXPort5,SKXFPDivider], 18, [2,1,10], 3, 7>; // 10-14 cycles. +//defm : SKXWriteResPair<WriteFDiv64, [SKXPort0,SKXFPDivider], 14, [1,3], 1, 5>; // 10-14 cycles. // Floating point division. +//defm : SKXWriteResPair<WriteFDiv64X, [SKXPort0,SKXFPDivider], 14, [1,3], 1, 6>; // 10-14 cycles. +//defm : SKXWriteResPair<WriteFDiv64Y, [SKXPort0,SKXFPDivider], 14, [1,5], 1, 7>; // 10-14 cycles. +defm : SKXWriteResPair<WriteFDiv64Z, [SKXPort0,SKXPort5,SKXFPDivider], 23, [2,1,16], 3, 7>; // 10-14 cycles. + +defm : SKXWriteResPair<WriteFSqrt, [SKXPort0,SKXFPDivider], 12, [1,3], 1, 5>; // Floating point square root. +defm : SKXWriteResPair<WriteFSqrtX, [SKXPort0,SKXFPDivider], 12, [1,3], 1, 6>; +defm : SKXWriteResPair<WriteFSqrtY, [SKXPort0,SKXFPDivider], 12, [1,6], 1, 7>; +defm : SKXWriteResPair<WriteFSqrtZ, [SKXPort0,SKXPort5,SKXFPDivider], 20, [2,1,12], 3, 7>; +defm : SKXWriteResPair<WriteFSqrt64, [SKXPort0,SKXFPDivider], 18, [1,6], 1, 5>; // Floating point double square root. +defm : SKXWriteResPair<WriteFSqrt64X, [SKXPort0,SKXFPDivider], 18, [1,6], 1, 6>; +defm : SKXWriteResPair<WriteFSqrt64Y, [SKXPort0,SKXFPDivider], 18, [1,12],1, 7>; +defm : SKXWriteResPair<WriteFSqrt64Z, [SKXPort0,SKXPort5,SKXFPDivider], 32, [2,1,24], 3, 7>; +defm : SKXWriteResPair<WriteFSqrt80, [SKXPort0,SKXFPDivider], 21, [1,7]>; // Floating point long double square root. + +defm : SKXWriteResPair<WriteFRcp, [SKXPort0], 4, [1], 1, 5>; // Floating point reciprocal estimate. +defm : SKXWriteResPair<WriteFRcpX, [SKXPort0], 4, [1], 1, 6>; +defm : SKXWriteResPair<WriteFRcpY, [SKXPort0], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteFRcpZ, [SKXPort0,SKXPort5], 4, [2,1], 3, 7>; + +defm : SKXWriteResPair<WriteFRsqrt, [SKXPort0], 4, [1], 1, 5>; // Floating point reciprocal square root estimate. +defm : SKXWriteResPair<WriteFRsqrtX,[SKXPort0], 4, [1], 1, 6>; +defm : SKXWriteResPair<WriteFRsqrtY,[SKXPort0], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteFRsqrtZ,[SKXPort0,SKXPort5], 9, [2,1], 3, 7>; + +defm : SKXWriteResPair<WriteFMA, [SKXPort01], 4, [1], 1, 5>; // Fused Multiply Add. +defm : SKXWriteResPair<WriteFMAX, [SKXPort01], 4, [1], 1, 6>; +defm : SKXWriteResPair<WriteFMAY, [SKXPort01], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteFMAZ, [SKXPort05], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteDPPD, [SKXPort5,SKXPort015], 9, [1,2], 3, 6>; // Floating point double dot product. +defm : SKXWriteResPair<WriteDPPS, [SKXPort5,SKXPort015], 13, [1,3], 4, 6>; +defm : SKXWriteResPair<WriteDPPSY,[SKXPort5,SKXPort015], 13, [1,3], 4, 7>; +defm : SKXWriteResPair<WriteDPPSZ,[SKXPort5,SKXPort015], 13, [1,3], 4, 7>; +defm : SKXWriteResPair<WriteFSign, [SKXPort0], 1>; // Floating point fabs/fchs. +defm : SKXWriteResPair<WriteFRnd, [SKXPort01], 8, [2], 2, 6>; // Floating point rounding. +defm : SKXWriteResPair<WriteFRndY, [SKXPort01], 8, [2], 2, 7>; +defm : SKXWriteResPair<WriteFRndZ, [SKXPort05], 8, [2], 2, 7>; +defm : SKXWriteResPair<WriteFLogic, [SKXPort015], 1, [1], 1, 6>; // Floating point and/or/xor logicals. +defm : SKXWriteResPair<WriteFLogicY, [SKXPort015], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteFLogicZ, [SKXPort05], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteFTest, [SKXPort0], 2, [1], 1, 6>; // Floating point TEST instructions. +defm : SKXWriteResPair<WriteFTestY, [SKXPort0], 2, [1], 1, 7>; +defm : SKXWriteResPair<WriteFTestZ, [SKXPort0], 2, [1], 1, 7>; +defm : SKXWriteResPair<WriteFShuffle, [SKXPort5], 1, [1], 1, 6>; // Floating point vector shuffles. 
+defm : SKXWriteResPair<WriteFShuffleY, [SKXPort5], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteFShuffleZ, [SKXPort5], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteFVarShuffle, [SKXPort5], 1, [1], 1, 6>; // Floating point vector variable shuffles. +defm : SKXWriteResPair<WriteFVarShuffleY, [SKXPort5], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteFVarShuffleZ, [SKXPort5], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteFBlend, [SKXPort015], 1, [1], 1, 6>; // Floating point vector blends. +defm : SKXWriteResPair<WriteFBlendY,[SKXPort015], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteFBlendZ,[SKXPort015], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteFVarBlend, [SKXPort015], 2, [2], 2, 6>; // Fp vector variable blends. +defm : SKXWriteResPair<WriteFVarBlendY,[SKXPort015], 2, [2], 2, 7>; +defm : SKXWriteResPair<WriteFVarBlendZ,[SKXPort015], 2, [2], 2, 7>; + +// FMA Scheduling helper class. +// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } + +// Vector integer operations. +defm : X86WriteRes<WriteVecLoad, [SKXPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [SKXPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [SKXPort23], 7, [1], 1>; +defm : X86WriteRes<WriteVecLoadNT, [SKXPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadNTY, [SKXPort23], 7, [1], 1>; +defm : X86WriteRes<WriteVecMaskedLoad, [SKXPort23,SKXPort015], 7, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedLoadY, [SKXPort23,SKXPort015], 8, [1,1], 2>; +defm : X86WriteRes<WriteVecStore, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecMove, [SKXPort05], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [SKXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [SKXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveToGpr, [SKXPort0], 2, [1], 1>; +defm : X86WriteRes<WriteVecMoveFromGpr, [SKXPort5], 1, [1], 1>; + +defm : SKXWriteResPair<WriteVecALU, [SKXPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals. +defm : SKXWriteResPair<WriteVecALUX, [SKXPort01], 1, [1], 1, 6>; +defm : SKXWriteResPair<WriteVecALUY, [SKXPort01], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteVecALUZ, [SKXPort0], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteVecLogic, [SKXPort05], 1, [1], 1, 5>; // Vector integer and/or/xor. +defm : SKXWriteResPair<WriteVecLogicX,[SKXPort015], 1, [1], 1, 6>; +defm : SKXWriteResPair<WriteVecLogicY,[SKXPort015], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteVecLogicZ,[SKXPort05], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteVecTest, [SKXPort0,SKXPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions. +defm : SKXWriteResPair<WriteVecTestY, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>; +defm : SKXWriteResPair<WriteVecTestZ, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>; +defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 4, [1], 1, 5>; // Vector integer multiply. +defm : SKXWriteResPair<WriteVecIMulX, [SKXPort01], 4, [1], 1, 6>; +defm : SKXWriteResPair<WriteVecIMulY, [SKXPort01], 4, [1], 1, 7>; +defm : SKXWriteResPair<WriteVecIMulZ, [SKXPort05], 4, [1], 1, 7>; +defm : SKXWriteResPair<WritePMULLD, [SKXPort01], 10, [2], 2, 6>; // Vector PMULLD. 
+defm : SKXWriteResPair<WritePMULLDY, [SKXPort01], 10, [2], 2, 7>; +defm : SKXWriteResPair<WritePMULLDZ, [SKXPort05], 10, [2], 2, 7>; +defm : SKXWriteResPair<WriteShuffle, [SKXPort5], 1, [1], 1, 5>; // Vector shuffles. +defm : SKXWriteResPair<WriteShuffleX, [SKXPort5], 1, [1], 1, 6>; +defm : SKXWriteResPair<WriteShuffleY, [SKXPort5], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteShuffleZ, [SKXPort5], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteVarShuffle, [SKXPort5], 1, [1], 1, 5>; // Vector variable shuffles. +defm : SKXWriteResPair<WriteVarShuffleX, [SKXPort5], 1, [1], 1, 6>; +defm : SKXWriteResPair<WriteVarShuffleY, [SKXPort5], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteVarShuffleZ, [SKXPort5], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteBlend, [SKXPort5], 1, [1], 1, 6>; // Vector blends. +defm : SKXWriteResPair<WriteBlendY,[SKXPort5], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteBlendZ,[SKXPort5], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteVarBlend, [SKXPort015], 2, [2], 2, 6>; // Vector variable blends. +defm : SKXWriteResPair<WriteVarBlendY,[SKXPort015], 2, [2], 2, 6>; +defm : SKXWriteResPair<WriteVarBlendZ,[SKXPort05], 2, [1], 1, 6>; +defm : SKXWriteResPair<WriteMPSAD, [SKXPort5], 4, [2], 2, 6>; // Vector MPSAD. +defm : SKXWriteResPair<WriteMPSADY, [SKXPort5], 4, [2], 2, 7>; +defm : SKXWriteResPair<WriteMPSADZ, [SKXPort5], 4, [2], 2, 7>; +defm : SKXWriteResPair<WritePSADBW, [SKXPort5], 3, [1], 1, 5>; // Vector PSADBW. +defm : SKXWriteResPair<WritePSADBWX, [SKXPort5], 3, [1], 1, 6>; +defm : SKXWriteResPair<WritePSADBWY, [SKXPort5], 3, [1], 1, 7>; +defm : SKXWriteResPair<WritePSADBWZ, [SKXPort5], 3, [1], 1, 7>; +defm : SKXWriteResPair<WritePHMINPOS, [SKXPort0], 4, [1], 1, 6>; // Vector PHMINPOS. + +// Vector integer shifts. +defm : SKXWriteResPair<WriteVecShift, [SKXPort0], 1, [1], 1, 5>; +defm : X86WriteRes<WriteVecShiftX, [SKXPort5,SKXPort01], 2, [1,1], 2>; +defm : X86WriteRes<WriteVecShiftY, [SKXPort5,SKXPort01], 4, [1,1], 2>; +defm : X86WriteRes<WriteVecShiftZ, [SKXPort5,SKXPort0], 4, [1,1], 2>; +defm : X86WriteRes<WriteVecShiftXLd, [SKXPort01,SKXPort23], 7, [1,1], 2>; +defm : X86WriteRes<WriteVecShiftYLd, [SKXPort01,SKXPort23], 8, [1,1], 2>; +defm : X86WriteRes<WriteVecShiftZLd, [SKXPort0,SKXPort23], 8, [1,1], 2>; + +defm : SKXWriteResPair<WriteVecShiftImm, [SKXPort0], 1, [1], 1, 5>; +defm : SKXWriteResPair<WriteVecShiftImmX, [SKXPort01], 1, [1], 1, 6>; // Vector integer immediate shifts. +defm : SKXWriteResPair<WriteVecShiftImmY, [SKXPort01], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteVecShiftImmZ, [SKXPort0], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteVarVecShift, [SKXPort01], 1, [1], 1, 6>; // Variable vector shifts. +defm : SKXWriteResPair<WriteVarVecShiftY, [SKXPort01], 1, [1], 1, 7>; +defm : SKXWriteResPair<WriteVarVecShiftZ, [SKXPort0], 1, [1], 1, 7>; + +// Vector insert/extract operations. +def : WriteRes<WriteVecInsert, [SKXPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVecInsertLd, [SKXPort5,SKXPort23]> { + let Latency = 6; + let NumMicroOps = 2; +} +def: InstRW<[WriteVecInsertLd], (instregex "(V?)MOV(H|L)(PD|PS)rm")>; + +def : WriteRes<WriteVecExtract, [SKXPort0,SKXPort5]> { + let Latency = 3; + let NumMicroOps = 2; +} +def : WriteRes<WriteVecExtractSt, [SKXPort4,SKXPort5,SKXPort237]> { + let Latency = 2; + let NumMicroOps = 3; +} + +// Conversion between integer and float. +defm : SKXWriteResPair<WriteCvtSS2I, [SKXPort01], 6, [2], 2>; // Needs more work: DD vs DQ. 
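+
+// The InstRW override used for the (V?)MOV(H|L)(PD|PS) loads above is the same
+// mechanism behind the long list of SKXWriteResGroup* defs later in this file:
+// a SchedWriteRes spells out the ports, latency and micro-op count, and InstRW
+// binds it to specific opcodes (by name or regex), taking precedence over the
+// default SchedWrite those opcodes would otherwise use. A minimal sketch with
+// a hypothetical group name and opcode pattern, for illustration only:
+//   def SKXWriteExampleGroup : SchedWriteRes<[SKXPort5,SKXPort23]> {
+//     let Latency = 6;            // port-5 uop plus the 5-cycle load
+//     let NumMicroOps = 2;
+//     let ResourceCycles = [1,1];
+//   }
+//   def: InstRW<[SKXWriteExampleGroup], (instregex "SOME_OP(16|32|64)rm")>;
+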
+defm : SKXWriteResPair<WriteCvtPS2I, [SKXPort01], 3>; +defm : SKXWriteResPair<WriteCvtPS2IY, [SKXPort01], 3>; +defm : SKXWriteResPair<WriteCvtPS2IZ, [SKXPort05], 3>; +defm : SKXWriteResPair<WriteCvtSD2I, [SKXPort01], 6, [2], 2>; +defm : SKXWriteResPair<WriteCvtPD2I, [SKXPort01], 3>; +defm : SKXWriteResPair<WriteCvtPD2IY, [SKXPort01], 3>; +defm : SKXWriteResPair<WriteCvtPD2IZ, [SKXPort05], 3>; + +defm : SKXWriteResPair<WriteCvtI2SS, [SKXPort1], 4>; +defm : SKXWriteResPair<WriteCvtI2PS, [SKXPort01], 4>; +defm : SKXWriteResPair<WriteCvtI2PSY, [SKXPort01], 4>; +defm : SKXWriteResPair<WriteCvtI2PSZ, [SKXPort05], 4>; // Needs more work: DD vs DQ. +defm : SKXWriteResPair<WriteCvtI2SD, [SKXPort1], 4>; +defm : SKXWriteResPair<WriteCvtI2PD, [SKXPort01], 4>; +defm : SKXWriteResPair<WriteCvtI2PDY, [SKXPort01], 4>; +defm : SKXWriteResPair<WriteCvtI2PDZ, [SKXPort05], 4>; + +defm : SKXWriteResPair<WriteCvtSS2SD, [SKXPort1], 3>; +defm : SKXWriteResPair<WriteCvtPS2PD, [SKXPort1], 3>; +defm : SKXWriteResPair<WriteCvtPS2PDY, [SKXPort5,SKXPort01], 3, [1,1], 2>; +defm : SKXWriteResPair<WriteCvtPS2PDZ, [SKXPort05], 3, [2], 2>; +defm : SKXWriteResPair<WriteCvtSD2SS, [SKXPort1], 3>; +defm : SKXWriteResPair<WriteCvtPD2PS, [SKXPort1], 3>; +defm : SKXWriteResPair<WriteCvtPD2PSY, [SKXPort5,SKXPort01], 3, [1,1], 2>; +defm : SKXWriteResPair<WriteCvtPD2PSZ, [SKXPort05], 3, [2], 2>; + +defm : X86WriteRes<WriteCvtPH2PS, [SKXPort5,SKXPort01], 5, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSY, [SKXPort5,SKXPort01], 7, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSZ, [SKXPort5,SKXPort0], 7, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSLd, [SKXPort23,SKXPort01], 9, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSYLd, [SKXPort23,SKXPort01], 10, [1,1], 2>; +defm : X86WriteRes<WriteCvtPH2PSZLd, [SKXPort23,SKXPort05], 10, [1,1], 2>; + +defm : X86WriteRes<WriteCvtPS2PH, [SKXPort5,SKXPort01], 5, [1,1], 2>; +defm : X86WriteRes<WriteCvtPS2PHY, [SKXPort5,SKXPort01], 7, [1,1], 2>; +defm : X86WriteRes<WriteCvtPS2PHZ, [SKXPort5,SKXPort05], 7, [1,1], 2>; +defm : X86WriteRes<WriteCvtPS2PHSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort01], 6, [1,1,1,1], 4>; +defm : X86WriteRes<WriteCvtPS2PHYSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort01], 8, [1,1,1,1], 4>; +defm : X86WriteRes<WriteCvtPS2PHZSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort05], 8, [1,1,1,1], 4>; + +// Strings instructions. 
+ +// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [SKXPort0]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrMLd, [SKXPort0, SKXPort23]> { + let Latency = 16; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [SKXPort0, SKXPort5, SKXPort015, SKXPort0156]> { + let Latency = 19; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def : WriteRes<WritePCmpEStrMLd, [SKXPort0, SKXPort5, SKXPort23, SKXPort015, SKXPort0156]> { + let Latency = 25; + let NumMicroOps = 10; + let ResourceCycles = [4,3,1,1,1]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [SKXPort0]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrILd, [SKXPort0, SKXPort23]> { + let Latency = 16; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [SKXPort0,SKXPort5,SKXPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [4,3,1]; +} +def : WriteRes<WritePCmpEStrILd, [SKXPort0, SKXPort5, SKXPort23, SKXPort0156]> { + let Latency = 24; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} + +// MOVMSK Instructions. +def : WriteRes<WriteFMOVMSK, [SKXPort0]> { let Latency = 2; } +def : WriteRes<WriteVecMOVMSK, [SKXPort0]> { let Latency = 2; } +def : WriteRes<WriteVecMOVMSKY, [SKXPort0]> { let Latency = 2; } +def : WriteRes<WriteMMXMOVMSK, [SKXPort0]> { let Latency = 2; } + +// AES instructions. +def : WriteRes<WriteAESDecEnc, [SKXPort0]> { // Decryption, encryption. + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESDecEncLd, [SKXPort0, SKXPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} + +def : WriteRes<WriteAESIMC, [SKXPort0]> { // InvMixColumn. + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESIMCLd, [SKXPort0, SKXPort23]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} + +def : WriteRes<WriteAESKeyGen, [SKXPort0,SKXPort5,SKXPort015]> { // Key Generation. + let Latency = 20; + let NumMicroOps = 11; + let ResourceCycles = [3,6,2]; +} +def : WriteRes<WriteAESKeyGenLd, [SKXPort0,SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 25; + let NumMicroOps = 11; + let ResourceCycles = [3,6,1,1]; +} + +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [SKXPort5]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def : WriteRes<WriteCLMulLd, [SKXPort5, SKXPort23]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} + +// Catch-all for expensive system instructions. +def : WriteRes<WriteSystem, [SKXPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite; + +// AVX2. +defm : SKXWriteResPair<WriteFShuffle256, [SKXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector shuffles. +defm : SKXWriteResPair<WriteFVarShuffle256, [SKXPort5], 3, [1], 1, 7>; // Fp 256-bit width vector variable shuffles. +defm : SKXWriteResPair<WriteShuffle256, [SKXPort5], 3, [1], 1, 7>; // 256-bit width vector shuffles. +defm : SKXWriteResPair<WriteVarShuffle256, [SKXPort5], 3, [1], 1, 7>; // 256-bit width vector variable shuffles. + +// Old microcoded instructions that nobody use. 
+def : WriteRes<WriteMicrocoded, [SKXPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite; + +// Fence instructions. +def : WriteRes<WriteFence, [SKXPort23, SKXPort4]>; + +// Load/store MXCSR. +def : WriteRes<WriteLDMXCSR, [SKXPort0,SKXPort23,SKXPort0156]> { let Latency = 7; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } +def : WriteRes<WriteSTMXCSR, [SKXPort4,SKXPort5,SKXPort237]> { let Latency = 2; let NumMicroOps = 3; let ResourceCycles = [1,1,1]; } + +// Nop, not very useful expect it provides a model for nops! +def : WriteRes<WriteNop, []>; + +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : SKXWriteResPair<WriteFHAdd, [SKXPort5,SKXPort015], 6, [2,1], 3, 6>; +defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort015], 6, [2,1], 3, 7>; +defm : SKXWriteResPair<WritePHAdd, [SKXPort5,SKXPort05], 3, [2,1], 3, 5>; +defm : SKXWriteResPair<WritePHAddX, [SKXPort5,SKXPort015], 3, [2,1], 3, 6>; +defm : SKXWriteResPair<WritePHAddY, [SKXPort5,SKXPort015], 3, [2,1], 3, 7>; + +// Remaining instrs. + +def SKXWriteResGroup1 : SchedWriteRes<[SKXPort0]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup1], (instregex "KAND(B|D|Q|W)rr", + "KANDN(B|D|Q|W)rr", + "KMOV(B|D|Q|W)kk", + "KNOT(B|D|Q|W)rr", + "KOR(B|D|Q|W)rr", + "KXNOR(B|D|Q|W)rr", + "KXOR(B|D|Q|W)rr", + "MMX_PADDS(B|W)irr", + "MMX_PADDUS(B|W)irr", + "MMX_PAVG(B|W)irr", + "MMX_PCMPEQ(B|D|W)irr", + "MMX_PCMPGT(B|D|W)irr", + "MMX_P(MAX|MIN)SWirr", + "MMX_P(MAX|MIN)UBirr", + "MMX_PSUBS(B|W)irr", + "MMX_PSUBUS(B|W)irr", + "VPMOVB2M(Z|Z128|Z256)rr", + "VPMOVD2M(Z|Z128|Z256)rr", + "VPMOVQ2M(Z|Z128|Z256)rr", + "VPMOVW2M(Z|Z128|Z256)rr")>; + +def SKXWriteResGroup3 : SchedWriteRes<[SKXPort5]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup3], (instregex "COM(P?)_FST0r", + "KMOV(B|D|Q|W)kr", + "UCOM_F(P?)r")>; + +def SKXWriteResGroup4 : SchedWriteRes<[SKXPort6]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup4], (instregex "JMP(16|32|64)r")>; + +def SKXWriteResGroup6 : SchedWriteRes<[SKXPort05]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup6], (instrs FINCSTP, FNOP)>; + +def SKXWriteResGroup7 : SchedWriteRes<[SKXPort06]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup7], (instrs CDQ, CQO, CLAC, STAC)>; + +def SKXWriteResGroup8 : SchedWriteRes<[SKXPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup8], (instregex "ANDN(32|64)rr", + "BLSI(32|64)rr", + "BLSMSK(32|64)rr", + "BLSR(32|64)rr")>; + +def SKXWriteResGroup9 : SchedWriteRes<[SKXPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPD(Z128|Z256)rr", + "VBLENDMPS(Z128|Z256)rr", + "VPADD(B|D|Q|W)(Y|Z|Z128|Z256)rr", + "(V?)PADD(B|D|Q|W)rr", + "VPBLENDD(Y?)rri", + "VPBLENDMB(Z128|Z256)rr", + "VPBLENDMD(Z128|Z256)rr", + "VPBLENDMQ(Z128|Z256)rr", + "VPBLENDMW(Z128|Z256)rr", + "VPSUB(B|D|Q|W)(Y|Z|Z128|Z256)rr", + "(V?)PSUB(B|D|Q|W)rr", + "VPTERNLOGD(Z|Z128|Z256)rri", + "VPTERNLOGQ(Z|Z128|Z256)rri")>; + +def SKXWriteResGroup10 : SchedWriteRes<[SKXPort0156]> { + let Latency = 1; + let NumMicroOps = 1; + 
let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup10], (instrs CBW, CWDE, CDQE, + CMC, STC)>; +def: InstRW<[SKXWriteResGroup10], (instregex "SGDT64m", + "SIDT64m", + "SMSW16m", + "STRm", + "SYSCALL")>; + +def SKXWriteResGroup11 : SchedWriteRes<[SKXPort4,SKXPort237]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup11], (instregex "FBSTPm", + "KMOV(B|D|Q|W)mk", + "ST_FP(32|64|80)m", + "VMPTRSTm")>; + +def SKXWriteResGroup13 : SchedWriteRes<[SKXPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKXWriteResGroup13], (instregex "MMX_MOVQ2DQrr")>; + +def SKXWriteResGroup14 : SchedWriteRes<[SKXPort05]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKXWriteResGroup14], (instrs FDECSTP)>; +def: InstRW<[SKXWriteResGroup14], (instregex "MMX_MOVDQ2Qrr")>; + +def SKXWriteResGroup15 : SchedWriteRes<[SKXPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKXWriteResGroup15], (instregex "ROL(8|16|32|64)r1", + "ROL(8|16|32|64)ri", + "ROR(8|16|32|64)r1", + "ROR(8|16|32|64)ri", + "SET(A|BE)r")>; + +def SKXWriteResGroup17 : SchedWriteRes<[SKXPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKXWriteResGroup17], (instrs LFENCE, + WAIT, + XGETBV)>; + +def SKXWriteResGroup20 : SchedWriteRes<[SKXPort6,SKXPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup20], (instregex "CLFLUSH")>; + +def SKXWriteResGroup21 : SchedWriteRes<[SKXPort237,SKXPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup21], (instrs SFENCE)>; + +def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup23], (instrs CWD)>; +def: InstRW<[SKXWriteResGroup23], (instrs JCXZ, JECXZ, JRCXZ)>; +def: InstRW<[SKXWriteResGroup23], (instregex "ADC8i8", + "ADC8ri", + "SBB8i8", + "SBB8ri")>; + +def SKXWriteResGroup25 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup25], (instrs FNSTCW16m)>; + +def SKXWriteResGroup27 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort15]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup27], (instregex "MOVBE(16|32|64)mr")>; + +def SKXWriteResGroup28 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup28], (instrs PUSH16r, PUSH32r, PUSH64r, + STOSB, STOSL, STOSQ, STOSW)>; +def: InstRW<[SKXWriteResGroup28], (instregex "PUSH(16|32|64)rmr", + "PUSH64i8")>; + +def SKXWriteResGroup29 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort15]> { + let Latency = 2; + let NumMicroOps = 5; + let ResourceCycles = [2,2,1]; +} +def: InstRW<[SKXWriteResGroup29], (instregex "VMOVDQU8Zmr(b?)")>; + +def SKXWriteResGroup30 : SchedWriteRes<[SKXPort0]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup30], (instregex "KMOV(B|D|Q|W)rk", + "KORTEST(B|D|Q|W)rr", + "KTEST(B|D|Q|W)rr")>; + +def SKXWriteResGroup31 : SchedWriteRes<[SKXPort1]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup31], 
(instregex "PDEP(32|64)rr", + "PEXT(32|64)rr")>; + +def SKXWriteResGroup31_16i : SchedWriteRes<[SKXPort1, SKXPort0156]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup31_16i], (instrs IMUL16rri, IMUL16rri8)>; + + +def SKXWriteResGroup32 : SchedWriteRes<[SKXPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)", + "KADD(B|D|Q|W)rr", + "KSHIFTL(B|D|Q|W)ri", + "KSHIFTR(B|D|Q|W)ri", + "KUNPCKBWrr", + "KUNPCKDQrr", + "KUNPCKWDrr", + "VALIGND(Z|Z128|Z256)rri", + "VALIGNQ(Z|Z128|Z256)rri", + "VCMPPD(Z|Z128|Z256)rri", + "VCMPPS(Z|Z128|Z256)rri", + "VCMPSDZrr", + "VCMPSSZrr", + "VDBPSADBWZrri", // TODO: 512-bit ops require ports 0/1 to be joined. + "VFPCLASSPD(Z|Z128|Z256)rr", + "VFPCLASSPS(Z|Z128|Z256)rr", + "VFPCLASSSDZrr", + "VFPCLASSSSZrr", + "VPBROADCASTBrr", + "VPBROADCASTWrr", + "VPCMPB(Z|Z128|Z256)rri", + "VPCMPD(Z|Z128|Z256)rri", + "VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr", + "VPCMPGT(B|D|Q|W)(Z|Z128|Z256)rr", + "(V?)PCMPGTQ(Y?)rr", + "VPCMPQ(Z|Z128|Z256)rri", + "VPCMPU(B|D|Q|W)(Z|Z128|Z256)rri", + "VPCMPW(Z|Z128|Z256)rri", + "VP(MAX|MIN)(S|U)Q(Z|Z128|Z256)rr", + "VPSADBWZrr", // TODO: 512-bit ops require ports 0/1 to be joined. + "VPTEST(N?)M(B|D|Q|W)(Z|Z128|Z256)rr")>; + +def SKXWriteResGroup34 : SchedWriteRes<[SKXPort0,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup34], (instrs FNSTSW16r)>; + +def SKXWriteResGroup35 : SchedWriteRes<[SKXPort06]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKXWriteResGroup35], (instregex "ROL(8|16|32|64)rCL", + "ROR(8|16|32|64)rCL", + "SAR(8|16|32|64)rCL", + "SHL(8|16|32|64)rCL", + "SHR(8|16|32|64)rCL")>; + +def SKXWriteResGroup36 : SchedWriteRes<[SKXPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKXWriteResGroup36], (instrs XADD8rr, XADD16rr, XADD32rr, XADD64rr, + XCHG8rr, XCHG16rr, XCHG32rr, XCHG64rr, + XCHG16ar, XCHG32ar, XCHG64ar)>; + +def SKXWriteResGroup37 : SchedWriteRes<[SKXPort0,SKXPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup37], (instregex "MMX_PH(ADD|SUB)SWrr")>; + +def SKXWriteResGroup38 : SchedWriteRes<[SKXPort5,SKXPort01]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup38], (instregex "(V?)PH(ADD|SUB)SW(Y?)rr")>; + +def SKXWriteResGroup41 : SchedWriteRes<[SKXPort5,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKSSDWirr", + "MMX_PACKSSWBirr", + "MMX_PACKUSWBirr")>; + +def SKXWriteResGroup42 : SchedWriteRes<[SKXPort6,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup42], (instregex "CLD")>; + +def SKXWriteResGroup43 : SchedWriteRes<[SKXPort237,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup43], (instrs MFENCE)>; + +def SKXWriteResGroup44 : SchedWriteRes<[SKXPort06,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup44], (instregex "RCL(8|16|32|64)r1", + "RCL(8|16|32|64)ri", + "RCR(8|16|32|64)r1", + "RCR(8|16|32|64)ri")>; + +def SKXWriteResGroup45 : 
SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup45], (instrs FNSTSWm)>; + +def SKXWriteResGroup46 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKXWriteResGroup46], (instregex "SET(A|BE)m")>; + +def SKXWriteResGroup47 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup47], (instregex "CALL(16|32|64)r")>; + +def SKXWriteResGroup48 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup48], (instrs CALL64pcrel32)>; + +def SKXWriteResGroup49 : SchedWriteRes<[SKXPort0]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup49], (instregex "MUL_(FPrST0|FST0r|FrST0)")>; + +def SKXWriteResGroup50 : SchedWriteRes<[SKXPort01]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PS(Y|Z128|Z256)rr", + "(V?)CVTDQ2PSrr", + "VCVTPD2QQ(Z128|Z256)rr", + "VCVTPD2UQQ(Z128|Z256)rr", + "VCVTPS2DQ(Y|Z128|Z256)rr", + "(V?)CVTPS2DQrr", + "VCVTPS2UDQ(Z128|Z256)rr", + "VCVTQQ2PD(Z128|Z256)rr", + "VCVTTPD2QQ(Z128|Z256)rr", + "VCVTTPD2UQQ(Z128|Z256)rr", + "VCVTTPS2DQ(Z128|Z256)rr", + "(V?)CVTTPS2DQrr", + "VCVTTPS2UDQ(Z128|Z256)rr", + "VCVTUDQ2PS(Z128|Z256)rr", + "VCVTUQQ2PD(Z128|Z256)rr")>; + +def SKXWriteResGroup50z : SchedWriteRes<[SKXPort05]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup50z], (instrs VCVTDQ2PSZrr, + VCVTPD2QQZrr, + VCVTPD2UQQZrr, + VCVTPS2DQZrr, + VCVTPS2UDQZrr, + VCVTQQ2PDZrr, + VCVTTPD2QQZrr, + VCVTTPD2UQQZrr, + VCVTTPS2DQZrr, + VCVTTPS2UDQZrr, + VCVTUDQ2PSZrr, + VCVTUQQ2PDZrr)>; + +def SKXWriteResGroup51 : SchedWriteRes<[SKXPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPD(Z|Z128|Z256)rr", + "VEXPANDPS(Z|Z128|Z256)rr", + "VPEXPANDD(Z|Z128|Z256)rr", + "VPEXPANDQ(Z|Z128|Z256)rr", + "VPMOVDB(Z|Z128|Z256)rr", + "VPMOVDW(Z|Z128|Z256)rr", + "VPMOVQB(Z|Z128|Z256)rr", + "VPMOVQW(Z|Z128|Z256)rr", + "VPMOVSDB(Z|Z128|Z256)rr", + "VPMOVSDW(Z|Z128|Z256)rr", + "VPMOVSQB(Z|Z128|Z256)rr", + "VPMOVSQD(Z|Z128|Z256)rr", + "VPMOVSQW(Z|Z128|Z256)rr", + "VPMOVSWB(Z|Z128|Z256)rr", + "VPMOVUSDB(Z|Z128|Z256)rr", + "VPMOVUSDW(Z|Z128|Z256)rr", + "VPMOVUSQB(Z|Z128|Z256)rr", + "VPMOVUSQD(Z|Z128|Z256)rr", + "VPMOVUSWB(Z|Z128|Z256)rr", + "VPMOVWB(Z|Z128|Z256)rr")>; + +def SKXWriteResGroup52 : SchedWriteRes<[SKXPort1,SKXPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup52], (instrs IMUL64r, MUL64r, MULX64rr)>; + +def SKXWriteResGroup52_16 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKXWriteResGroup52_16], (instrs IMUL16r, MUL16r)>; + +def SKXWriteResGroup54 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup54], (instregex "IST(T?)_FP(16|32|64)m", + "IST_F(16|32)m", + "VPMOVQD(Z|Z128|Z256)mr(b?)")>; + +def SKXWriteResGroup55 : SchedWriteRes<[SKXPort0156]> 
{ + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} +def: InstRW<[SKXWriteResGroup55], (instrs FNCLEX)>; + +def SKXWriteResGroup56 : SchedWriteRes<[SKXPort015,SKXPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKXWriteResGroup56], (instrs VZEROUPPER)>; + +def SKXWriteResGroup57 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKXWriteResGroup57], (instregex "LAR(16|32|64)rr")>; + +def SKXWriteResGroup58 : SchedWriteRes<[SKXPort23]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm16", + "MOVSX(16|32|64)rm32", + "MOVSX(16|32|64)rm8", + "MOVZX(16|32|64)rm16", + "MOVZX(16|32|64)rm8", + "(V?)MOVDDUPrm")>; // TODO: Should this be SKXWriteResGroup71? + +def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVT(T?)PD2PIirr", + "MMX_CVT(T?)PS2PIirr", + "VCVTDQ2PDZ128rr", + "VCVTPD2DQZ128rr", + "(V?)CVT(T?)PD2DQrr", + "VCVTPD2PSZ128rr", + "(V?)CVTPD2PSrr", + "VCVTPD2UDQZ128rr", + "VCVTPS2PDZ128rr", + "(V?)CVTPS2PDrr", + "VCVTPS2QQZ128rr", + "VCVTPS2UQQZ128rr", + "VCVTQQ2PSZ128rr", + "(V?)CVTSD2SS(Z?)rr", + "(V?)CVTSI(64)?2SDrr", + "VCVTSI2SSZrr", + "(V?)CVTSI2SSrr", + "VCVTSI(64)?2SDZrr", + "VCVTSS2SDZrr", + "(V?)CVTSS2SDrr", + "VCVTTPD2DQZ128rr", + "VCVTTPD2UDQZ128rr", + "VCVTTPS2QQZ128rr", + "VCVTTPS2UQQZ128rr", + "VCVTUDQ2PDZ128rr", + "VCVTUQQ2PSZ128rr", + "VCVTUSI2SSZrr", + "VCVTUSI(64)?2SDZrr")>; + +def SKXWriteResGroup62 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup62], (instregex "VPCONFLICTQZ128rr")>; + +def SKXWriteResGroup63 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup63], (instregex "STR(16|32|64)r")>; + +def SKXWriteResGroup64 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup64], (instrs IMUL32r, MUL32r, MULX32rr)>; + +def SKXWriteResGroup65 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort015]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZ128mr(b?)", + "VCVTPS2PHZ256mr(b?)", + "VCVTPS2PHZmr(b?)")>; + +def SKXWriteResGroup66 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDB(Z|Z128|Z256)mr(b?)", + "VPMOVDW(Z|Z128|Z256)mr(b?)", + "VPMOVQB(Z|Z128|Z256)mr(b?)", + "VPMOVQW(Z|Z128|Z256)mr(b?)", + "VPMOVSDB(Z|Z128|Z256)mr(b?)", + "VPMOVSDW(Z|Z128|Z256)mr(b?)", + "VPMOVSQB(Z|Z128|Z256)mr(b?)", + "VPMOVSQD(Z|Z128|Z256)mr(b?)", + "VPMOVSQW(Z|Z128|Z256)mr(b?)", + "VPMOVSWB(Z|Z128|Z256)mr(b?)", + "VPMOVUSDB(Z|Z128|Z256)mr(b?)", + "VPMOVUSDW(Z|Z128|Z256)mr(b?)", + "VPMOVUSQB(Z|Z128|Z256)mr(b?)", + "VPMOVUSQD(Z|Z128|Z256)mr(b?)", + "VPMOVUSQW(Z|Z128|Z256)mr(b?)", + "VPMOVUSWB(Z|Z128|Z256)mr(b?)", + "VPMOVWB(Z|Z128|Z256)mr(b?)")>; + +def SKXWriteResGroup67 : SchedWriteRes<[SKXPort06,SKXPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: 
InstRW<[SKXWriteResGroup67], (instrs XSETBV)>; + +def SKXWriteResGroup68 : SchedWriteRes<[SKXPort06,SKXPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; +} +def: InstRW<[SKXWriteResGroup68], (instregex "CMPXCHG(8|16|32|64)rr")>; + +def SKXWriteResGroup69 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort0156]> { + let Latency = 5; + let NumMicroOps = 6; + let ResourceCycles = [1,1,4]; +} +def: InstRW<[SKXWriteResGroup69], (instregex "PUSHF(16|64)")>; + +def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup71], (instregex "VBROADCASTSSrm", + "(V?)MOVSHDUPrm", + "(V?)MOVSLDUPrm", + "VPBROADCASTDrm", + "VPBROADCASTQrm")>; + +def SKXWriteResGroup72 : SchedWriteRes<[SKXPort5]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKXWriteResGroup72], (instregex "MMX_CVTPI2PSirr", + "VCOMPRESSPD(Z|Z128|Z256)rr", + "VCOMPRESSPS(Z|Z128|Z256)rr", + "VPCOMPRESSD(Z|Z128|Z256)rr", + "VPCOMPRESSQ(Z|Z128|Z256)rr", + "VPERMW(Z|Z128|Z256)rr")>; + +def SKXWriteResGroup73 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDSBirm", + "MMX_PADDSWirm", + "MMX_PADDUSBirm", + "MMX_PADDUSWirm", + "MMX_PAVGBirm", + "MMX_PAVGWirm", + "MMX_PCMPEQBirm", + "MMX_PCMPEQDirm", + "MMX_PCMPEQWirm", + "MMX_PCMPGTBirm", + "MMX_PCMPGTDirm", + "MMX_PCMPGTWirm", + "MMX_PMAXSWirm", + "MMX_PMAXUBirm", + "MMX_PMINSWirm", + "MMX_PMINUBirm", + "MMX_PSUBSBirm", + "MMX_PSUBSWirm", + "MMX_PSUBUSBirm", + "MMX_PSUBUSWirm")>; + +def SKXWriteResGroup76 : SchedWriteRes<[SKXPort6,SKXPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup76], (instregex "FARJMP64", + "JMP(16|32|64)m")>; + +def SKXWriteResGroup78 : SchedWriteRes<[SKXPort23,SKXPort06]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup78], (instregex "BT(16|32|64)mi8")>; + +def SKXWriteResGroup79 : SchedWriteRes<[SKXPort23,SKXPort15]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup79], (instregex "ANDN(32|64)rm", + "BLSI(32|64)rm", + "BLSMSK(32|64)rm", + "BLSR(32|64)rm", + "MOVBE(16|32|64)rm")>; + +def SKXWriteResGroup80 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup80], (instregex "VMOV(64to|QI2)PQIZrm(b?)", + "VMOVDI2PDIZrm(b?)")>; + +def SKXWriteResGroup81 : SchedWriteRes<[SKXPort23,SKXPort0156]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup81], (instrs POP16r, POP32r, POP64r)>; +def: InstRW<[SKXWriteResGroup81], (instregex "POP(16|32|64)rmr")>; + +def SKXWriteResGroup82 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup82], (instregex "(V?)CVTSI642SSrr", + "VCVTSI642SSZrr", + "VCVTUSI642SSZrr")>; + +def SKXWriteResGroup84 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06,SKXPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup84], (instregex "SLDT(16|32|64)r")>; + +def SKXWriteResGroup86 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> { + let Latency = 6; + let NumMicroOps = 4; + let 
ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup86], (instregex "BTC(16|32|64)mi8", + "BTR(16|32|64)mi8", + "BTS(16|32|64)mi8", + "SAR(8|16|32|64)m1", + "SAR(8|16|32|64)mi", + "SHL(8|16|32|64)m1", + "SHL(8|16|32|64)mi", + "SHR(8|16|32|64)m1", + "SHR(8|16|32|64)mi")>; + +def SKXWriteResGroup87 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup87], (instregex "POP(16|32|64)rmm", + "PUSH(16|32|64)rmm")>; + +def SKXWriteResGroup88 : SchedWriteRes<[SKXPort6,SKXPort0156]> { + let Latency = 6; + let NumMicroOps = 6; + let ResourceCycles = [1,5]; +} +def: InstRW<[SKXWriteResGroup88], (instrs STD)>; + +def SKXWriteResGroup89 : SchedWriteRes<[SKXPort23]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup89], (instregex "LD_F(32|64|80)m", + "VBROADCASTF128", + "VBROADCASTI128", + "VBROADCASTSDYrm", + "VBROADCASTSSYrm", + "VMOVDDUPYrm", + "VMOVSHDUPYrm", + "VMOVSLDUPYrm", + "VPBROADCASTDYrm", + "VPBROADCASTQYrm")>; + +def SKXWriteResGroup90 : SchedWriteRes<[SKXPort01,SKXPort5]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup90], (instregex "VCVTDQ2PDYrr")>; + +def SKXWriteResGroup92 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSDZrm(b?)", + "VMOVSSZrm(b?)")>; + +def SKXWriteResGroup92a : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup92a], (instregex "(V?)PMOV(SX|ZX)BDrm", + "(V?)PMOV(SX|ZX)BQrm", + "(V?)PMOV(SX|ZX)BWrm", + "(V?)PMOV(SX|ZX)DQrm", + "(V?)PMOV(SX|ZX)WDrm", + "(V?)PMOV(SX|ZX)WQrm")>; + +def SKXWriteResGroup93 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZ256rr", + "VCVTPD2DQ(Y|Z256)rr", + "VCVTPD2PS(Y|Z256)rr", + "VCVTPD2UDQZ256rr", + "VCVTPS2PD(Y|Z256)rr", + "VCVTPS2QQZ256rr", + "VCVTPS2UQQZ256rr", + "VCVTQQ2PSZ256rr", + "VCVTTPD2DQ(Y|Z256)rr", + "VCVTTPD2UDQZ256rr", + "VCVTTPS2QQZ256rr", + "VCVTTPS2UQQZ256rr", + "VCVTUDQ2PDZ256rr", + "VCVTUQQ2PSZ256rr")>; + +def SKXWriteResGroup93z : SchedWriteRes<[SKXPort5,SKXPort05]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup93z], (instrs VCVTDQ2PDZrr, + VCVTPD2DQZrr, + VCVTPD2PSZrr, + VCVTPD2UDQZrr, + VCVTPS2PDZrr, + VCVTPS2QQZrr, + VCVTPS2UQQZrr, + VCVTQQ2PSZrr, + VCVTTPD2DQZrr, + VCVTTPD2UDQZrr, + VCVTTPS2QQZrr, + VCVTTPS2UQQZrr, + VCVTUDQ2PDZrr, + VCVTUQQ2PSZrr)>; + +def SKXWriteResGroup95 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPDZ128rm(b?)", + "VBLENDMPSZ128rm(b?)", + "VBROADCASTI32X2Z128m(b?)", + "VBROADCASTSSZ128m(b?)", + "VINSERTF128rm", + "VINSERTI128rm", + "VMOVAPDZ128rm(b?)", + "VMOVAPSZ128rm(b?)", + "VMOVDDUPZ128rm(b?)", + "VMOVDQA32Z128rm(b?)", + "VMOVDQA64Z128rm(b?)", + "VMOVDQU16Z128rm(b?)", + "VMOVDQU32Z128rm(b?)", + "VMOVDQU64Z128rm(b?)", + "VMOVDQU8Z128rm(b?)", + "VMOVNTDQAZ128rm(b?)", + "VMOVSHDUPZ128rm(b?)", + "VMOVSLDUPZ128rm(b?)", + "VMOVUPDZ128rm(b?)", + "VMOVUPSZ128rm(b?)", + "VPADD(B|D|Q|W)Z128rm(b?)", + "(V?)PADD(B|D|Q|W)rm", + "VPBLENDDrmi", + 
"VPBLENDM(B|D|Q|W)Z128rm(b?)", + "VPBROADCASTDZ128m(b?)", + "VPBROADCASTQZ128m(b?)", + "VPSUB(B|D|Q|W)Z128rm(b?)", + "(V?)PSUB(B|D|Q|W)rm", + "VPTERNLOGDZ128rm(b?)i", + "VPTERNLOGQZ128rm(b?)i")>; + +def SKXWriteResGroup96 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKSSDWirm", + "MMX_PACKSSWBirm", + "MMX_PACKUSWBirm")>; + +def SKXWriteResGroup97 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2W128rr", + "VPERMI2W256rr", + "VPERMI2Wrr", + "VPERMT2W128rr", + "VPERMT2W256rr", + "VPERMT2Wrr")>; + +def SKXWriteResGroup99 : SchedWriteRes<[SKXPort23,SKXPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup99], (instrs LEAVE, LEAVE64, + SCASB, SCASL, SCASQ, SCASW)>; + +def SKXWriteResGroup100 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup100], (instregex "VCVTSS2USI64Zrr", + "(V?)CVTSS2SI64(Z?)rr", + "(V?)CVTTSS2SI64(Z?)rr", + "VCVTTSS2USI64Zrr")>; + +def SKXWriteResGroup101 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort05]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup101], (instrs FLDCW16m)>; + +def SKXWriteResGroup103 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup103], (instregex "KMOV(B|D|Q|W)km")>; + +def SKXWriteResGroup104 : SchedWriteRes<[SKXPort6,SKXPort23,SKXPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup104], (instrs LRETQ, RETQ)>; + +def SKXWriteResGroup106 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPD(Z|Z128|Z256)mr(b?)", + "VCOMPRESSPS(Z|Z128|Z256)mr(b?)", + "VPCOMPRESSD(Z|Z128|Z256)mr(b?)", + "VPCOMPRESSQ(Z|Z128|Z256)mr(b?)")>; + +def SKXWriteResGroup107 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SKXWriteResGroup107], (instregex "ROL(8|16|32|64)m1", + "ROL(8|16|32|64)mi", + "ROR(8|16|32|64)m1", + "ROR(8|16|32|64)mi")>; + +def SKXWriteResGroup108 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SKXWriteResGroup108], (instregex "XADD(8|16|32|64)rm")>; + +def SKXWriteResGroup109 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup109], (instregex "CALL(16|32|64)m", + "FARCALL64")>; + +def SKXWriteResGroup110 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> { + let Latency = 7; + let NumMicroOps = 7; + let ResourceCycles = [1,2,2,2]; +} +def: InstRW<[SKXWriteResGroup110], (instrs VPSCATTERDQZ128mr, + VPSCATTERQQZ128mr, + VSCATTERDPDZ128mr, + VSCATTERQPDZ128mr)>; + +def SKXWriteResGroup111 : SchedWriteRes<[SKXPort6,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 7; + let NumMicroOps = 7; + let ResourceCycles = [1,3,1,2]; +} +def: 
InstRW<[SKXWriteResGroup111], (instrs LOOP)>; + +def SKXWriteResGroup112 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> { + let Latency = 7; + let NumMicroOps = 11; + let ResourceCycles = [1,4,4,2]; +} +def: InstRW<[SKXWriteResGroup112], (instrs VPSCATTERDQZ256mr, + VPSCATTERQQZ256mr, + VSCATTERDPDZ256mr, + VSCATTERQPDZ256mr)>; + +def SKXWriteResGroup113 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> { + let Latency = 7; + let NumMicroOps = 19; + let ResourceCycles = [1,8,8,2]; +} +def: InstRW<[SKXWriteResGroup113], (instrs VPSCATTERDQZmr, + VPSCATTERQQZmr, + VSCATTERDPDZmr, + VSCATTERQPDZmr)>; + +def SKXWriteResGroup114 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> { + let Latency = 7; + let NumMicroOps = 36; + let ResourceCycles = [1,16,1,16,2]; +} +def: InstRW<[SKXWriteResGroup114], (instrs VSCATTERDPSZmr)>; + +def SKXWriteResGroup118 : SchedWriteRes<[SKXPort1,SKXPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup118], (instregex "PDEP(32|64)rm", + "PEXT(32|64)rm")>; + +def SKXWriteResGroup118_16_1 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup118_16_1], (instrs IMUL16rm, IMUL16rmi, IMUL16rmi8)>; + +def SKXWriteResGroup118_16_2 : SchedWriteRes<[SKXPort1, SKXPort06, SKXPort0156, SKXPort23]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,1,2,1]; +} +def: InstRW<[SKXWriteResGroup118_16_2], (instrs IMUL16m, MUL16m)>; + +def SKXWriteResGroup119 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup119], (instregex "FCOM(P?)(32|64)m", + "VFPCLASSSDZrm(b?)", + "VPBROADCASTBYrm", + "VPBROADCASTB(Z|Z256)m(b?)", + "VPBROADCASTWYrm", + "VPBROADCASTW(Z|Z256)m(b?)", + "VPMOVSXBDYrm", + "VPMOVSXBQYrm", + "VPMOVSXWQYrm")>; + +def SKXWriteResGroup121 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPD(Z|Z256)rm(b?)", + "VBLENDMPS(Z|Z256)rm(b?)", + "VBROADCASTF32X2Z256m(b?)", + "VBROADCASTF32X2Zm(b?)", + "VBROADCASTF32X4Z256rm(b?)", + "VBROADCASTF32X4rm(b?)", + "VBROADCASTF32X8rm(b?)", + "VBROADCASTF64X2Z128rm(b?)", + "VBROADCASTF64X2rm(b?)", + "VBROADCASTF64X4rm(b?)", + "VBROADCASTI32X2Z256m(b?)", + "VBROADCASTI32X2Zm(b?)", + "VBROADCASTI32X4Z256rm(b?)", + "VBROADCASTI32X4rm(b?)", + "VBROADCASTI32X8rm(b?)", + "VBROADCASTI64X2Z128rm(b?)", + "VBROADCASTI64X2rm(b?)", + "VBROADCASTI64X4rm(b?)", + "VBROADCASTSD(Z|Z256)m(b?)", + "VBROADCASTSS(Z|Z256)m(b?)", + "VINSERTF32x4(Z|Z256)rm(b?)", + "VINSERTF32x8Zrm(b?)", + "VINSERTF64x2(Z|Z256)rm(b?)", + "VINSERTF64x4Zrm(b?)", + "VINSERTI32x4(Z|Z256)rm(b?)", + "VINSERTI32x8Zrm(b?)", + "VINSERTI64x2(Z|Z256)rm(b?)", + "VINSERTI64x4Zrm(b?)", + "VMOVAPD(Z|Z256)rm(b?)", + "VMOVAPS(Z|Z256)rm(b?)", + "VMOVDDUP(Z|Z256)rm(b?)", + "VMOVDQA32(Z|Z256)rm(b?)", + "VMOVDQA64(Z|Z256)rm(b?)", + "VMOVDQU16(Z|Z256)rm(b?)", + "VMOVDQU32(Z|Z256)rm(b?)", + "VMOVDQU64(Z|Z256)rm(b?)", + "VMOVDQU8(Z|Z256)rm(b?)", + "VMOVNTDQAZ256rm(b?)", + "VMOVSHDUP(Z|Z256)rm(b?)", + "VMOVSLDUP(Z|Z256)rm(b?)", + "VMOVUPD(Z|Z256)rm(b?)", + "VMOVUPS(Z|Z256)rm(b?)", + "VPADD(B|D|Q|W)Yrm", + "VPADD(B|D|Q|W)(Z|Z256)rm(b?)", + "VPBLENDDYrmi", + "VPBLENDM(B|D|Q|W)(Z|Z256)rm(b?)", + "VPBROADCASTD(Z|Z256)m(b?)", + "VPBROADCASTQ(Z|Z256)m(b?)", + 
"VPSUB(B|D|Q|W)Yrm", + "VPSUB(B|D|Q|W)(Z|Z256)rm(b?)", + "VPTERNLOGD(Z|Z256)rm(b?)i", + "VPTERNLOGQ(Z|Z256)rm(b?)i")>; + +def SKXWriteResGroup123 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PH(ADD|SUB)SWrm")>; + +def SKXWriteResGroup126 : SchedWriteRes<[SKXPort23,SKXPort237,SKXPort06]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKXWriteResGroup126], (instregex "ROR(8|16|32|64)mCL")>; + +def SKXWriteResGroup127 : SchedWriteRes<[SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SKXWriteResGroup127], (instregex "RCL(8|16|32|64)m1", + "RCL(8|16|32|64)mi", + "RCR(8|16|32|64)m1", + "RCR(8|16|32|64)mi")>; + +def SKXWriteResGroup128 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> { + let Latency = 8; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,3]; +} +def: InstRW<[SKXWriteResGroup128], (instregex "ROL(8|16|32|64)mCL", + "SAR(8|16|32|64)mCL", + "SHL(8|16|32|64)mCL", + "SHR(8|16|32|64)mCL")>; + +def SKXWriteResGroup130 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 8; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,2,1]; +} +def: SchedAlias<WriteADCRMW, SKXWriteResGroup130>; +def: InstRW<[SKXWriteResGroup130], (instregex "CMPXCHG(8|16|32|64)rm")>; + +def SKXWriteResGroup131 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> { + let Latency = 8; + let NumMicroOps = 8; + let ResourceCycles = [1,2,1,2,2]; +} +def: InstRW<[SKXWriteResGroup131], (instrs VPSCATTERQDZ128mr, + VPSCATTERQDZ256mr, + VSCATTERQPSZ128mr, + VSCATTERQPSZ256mr)>; + +def SKXWriteResGroup132 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> { + let Latency = 8; + let NumMicroOps = 12; + let ResourceCycles = [1,4,1,4,2]; +} +def: InstRW<[SKXWriteResGroup132], (instrs VPSCATTERDDZ128mr, + VSCATTERDPSZ128mr)>; + +def SKXWriteResGroup133 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> { + let Latency = 8; + let NumMicroOps = 20; + let ResourceCycles = [1,8,1,8,2]; +} +def: InstRW<[SKXWriteResGroup133], (instrs VPSCATTERDDZ256mr, + VSCATTERDPSZ256mr)>; + +def SKXWriteResGroup134 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> { + let Latency = 8; + let NumMicroOps = 36; + let ResourceCycles = [1,16,1,16,2]; +} +def: InstRW<[SKXWriteResGroup134], (instrs VPSCATTERDDZmr)>; + +def SKXWriteResGroup135 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup135], (instregex "MMX_CVTPI2PSirm")>; + +def SKXWriteResGroup136 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup136], (instregex "VALIGNDZ128rm(b?)i", + "VALIGNQZ128rm(b?)i", + "VCMPPDZ128rm(b?)i", + "VCMPPSZ128rm(b?)i", + "VCMPSDZrm", + "VCMPSSZrm", + "VFPCLASSSSZrm(b?)", + "VPCMPBZ128rmi(b?)", + "VPCMPDZ128rmi(b?)", + "VPCMPEQ(B|D|Q|W)Z128rm(b?)", + "VPCMPGT(B|D|Q|W)Z128rm(b?)", + "(V?)PCMPGTQrm", + "VPCMPQZ128rmi(b?)", + "VPCMPU(B|D|Q|W)Z128rmi(b?)", + "VPCMPWZ128rmi(b?)", + "VPERMI2D128rm(b?)", + "VPERMI2PD128rm(b?)", + "VPERMI2PS128rm(b?)", + "VPERMI2Q128rm(b?)", + "VPERMT2D128rm(b?)", + "VPERMT2PD128rm(b?)", + "VPERMT2PS128rm(b?)", + "VPERMT2Q128rm(b?)", + "VPMAXSQZ128rm(b?)", + 
"VPMAXUQZ128rm(b?)", + "VPMINSQZ128rm(b?)", + "VPMINUQZ128rm(b?)", + "VPMOVSXBDZ128rm(b?)", + "VPMOVSXBQZ128rm(b?)", + "VPMOVSXBWYrm", + "VPMOVSXBWZ128rm(b?)", + "VPMOVSXDQYrm", + "VPMOVSXDQZ128rm(b?)", + "VPMOVSXWDYrm", + "VPMOVSXWDZ128rm(b?)", + "VPMOVSXWQZ128rm(b?)", + "VPMOVZXBDZ128rm(b?)", + "VPMOVZXBQZ128rm(b?)", + "VPMOVZXBWZ128rm(b?)", + "VPMOVZXDQZ128rm(b?)", + "VPMOVZXWDYrm", + "VPMOVZXWDZ128rm(b?)", + "VPMOVZXWQZ128rm(b?)", + "VPTESTMBZ128rm(b?)", + "VPTESTMDZ128rm(b?)", + "VPTESTMQZ128rm(b?)", + "VPTESTMWZ128rm(b?)", + "VPTESTNMBZ128rm(b?)", + "VPTESTNMDZ128rm(b?)", + "VPTESTNMQZ128rm(b?)", + "VPTESTNMWZ128rm(b?)")>; + +def SKXWriteResGroup137 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIirm", + "(V?)CVTPS2PDrm")>; + +def SKXWriteResGroup142 : SchedWriteRes<[SKXPort1,SKXPort5,SKXPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup142], (instrs IMUL64m, MUL64m, MULX64rm)>; + +def SKXWriteResGroup143 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup143], (instregex "(V?)PHADDSWrm", + "(V?)PHSUBSWrm")>; + +def SKXWriteResGroup146 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup146], (instregex "LAR(16|32|64)rm", + "LSL(16|32|64)rm")>; + +def SKXWriteResGroup148 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup148], (instregex "(ADD|SUB|SUBR)_F(32|64)m", + "ILD_F(16|32|64)m", + "VALIGND(Z|Z256)rm(b?)i", + "VALIGNQ(Z|Z256)rm(b?)i", + "VCMPPD(Z|Z256)rm(b?)i", + "VCMPPS(Z|Z256)rm(b?)i", + "VPCMPB(Z|Z256)rmi(b?)", + "VPCMPD(Z|Z256)rmi(b?)", + "VPCMPEQB(Z|Z256)rm(b?)", + "VPCMPEQD(Z|Z256)rm(b?)", + "VPCMPEQQ(Z|Z256)rm(b?)", + "VPCMPEQW(Z|Z256)rm(b?)", + "VPCMPGTB(Z|Z256)rm(b?)", + "VPCMPGTD(Z|Z256)rm(b?)", + "VPCMPGTQYrm", + "VPCMPGTQ(Z|Z256)rm(b?)", + "VPCMPGTW(Z|Z256)rm(b?)", + "VPCMPQ(Z|Z256)rmi(b?)", + "VPCMPU(B|D|Q|W)Z256rmi(b?)", + "VPCMPU(B|D|Q|W)Zrmi(b?)", + "VPCMPW(Z|Z256)rmi(b?)", + "VPMAXSQ(Z|Z256)rm(b?)", + "VPMAXUQ(Z|Z256)rm(b?)", + "VPMINSQ(Z|Z256)rm(b?)", + "VPMINUQ(Z|Z256)rm(b?)", + "VPTESTM(B|D|Q|W)Z256rm(b?)", + "VPTESTM(B|D|Q|W)Zrm(b?)", + "VPTESTNM(B|D|Q|W)Z256rm(b?)", + "VPTESTNM(B|D|Q|W)Zrm(b?)")>; + +def SKXWriteResGroup149 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PDZ128rm(b?)", + "VCVTDQ2PSZ128rm(b?)", + "(V?)CVTDQ2PSrm", + "VCVTPD2QQZ128rm(b?)", + "VCVTPD2UQQZ128rm(b?)", + "VCVTPH2PSZ128rm(b?)", + "VCVTPS2DQZ128rm(b?)", + "(V?)CVTPS2DQrm", + "VCVTPS2PDZ128rm(b?)", + "VCVTPS2QQZ128rm(b?)", + "VCVTPS2UDQZ128rm(b?)", + "VCVTPS2UQQZ128rm(b?)", + "VCVTQQ2PDZ128rm(b?)", + "VCVTQQ2PSZ128rm(b?)", + "VCVTSS2SDZrm", + "(V?)CVTSS2SDrm", + "VCVTTPD2QQZ128rm(b?)", + "VCVTTPD2UQQZ128rm(b?)", + "VCVTTPS2DQZ128rm(b?)", + "(V?)CVTTPS2DQrm", + "VCVTTPS2QQZ128rm(b?)", + "VCVTTPS2UDQZ128rm(b?)", + "VCVTTPS2UQQZ128rm(b?)", + "VCVTUDQ2PDZ128rm(b?)", + "VCVTUDQ2PSZ128rm(b?)", + "VCVTUQQ2PDZ128rm(b?)", + "VCVTUQQ2PSZ128rm(b?)")>; + +def SKXWriteResGroup151 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 10; + let NumMicroOps = 3; 
+ let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)", + "VEXPANDPSZ128rm(b?)", + "VPEXPANDDZ128rm(b?)", + "VPEXPANDQZ128rm(b?)")>; + +def SKXWriteResGroup153 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup153], (instregex "(V?)CVTSD2SSrm")>; + +def SKXWriteResGroup154 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup154], (instregex "VPHADDSWYrm", + "VPHSUBSWYrm")>; + +def SKXWriteResGroup156 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort06,SKXPort0156]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup156], (instrs IMUL32m, MUL32m, MULX32rm)>; + +def SKXWriteResGroup157 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 10; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,3]; +} +def: InstRW<[SKXWriteResGroup157], (instregex "XCHG(8|16|32|64)rm")>; + +def SKXWriteResGroup159 : SchedWriteRes<[SKXPort0,SKXFPDivider]> { + let Latency = 11; + let NumMicroOps = 1; + let ResourceCycles = [1,3]; +} +def : SchedAlias<WriteFDivX, SKXWriteResGroup159>; // TODO - convert to ZnWriteResFpuPair + +def SKXWriteResGroup160 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup160], (instregex "MUL_F(32|64)m")>; + +def SKXWriteResGroup161 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PD(Z|Z256)rm(b?)", + "VCVTDQ2PSYrm", + "VCVTDQ2PS(Z|Z256)rm(b?)", + "VCVTPH2PS(Z|Z256)rm(b?)", + "VCVTPS2PDYrm", + "VCVTPS2PD(Z|Z256)rm(b?)", + "VCVTQQ2PD(Z|Z256)rm(b?)", + "VCVTQQ2PSZ256rm(b?)", + "VCVT(T?)PD2QQ(Z|Z256)rm(b?)", + "VCVT(T?)PD2UQQ(Z|Z256)rm(b?)", + "VCVT(T?)PS2DQYrm", + "VCVT(T?)PS2DQ(Z|Z256)rm(b?)", + "VCVT(T?)PS2QQZ256rm(b?)", + "VCVT(T?)PS2UDQ(Z|Z256)rm(b?)", + "VCVT(T?)PS2UQQZ256rm(b?)", + "VCVTUDQ2PD(Z|Z256)rm(b?)", + "VCVTUDQ2PS(Z|Z256)rm(b?)", + "VCVTUQQ2PD(Z|Z256)rm(b?)", + "VCVTUQQ2PSZ256rm(b?)")>; + +def SKXWriteResGroup162 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup162], (instregex "FICOM(P?)(16|32)m", + "VEXPANDPD(Z|Z256)rm(b?)", + "VEXPANDPS(Z|Z256)rm(b?)", + "VPEXPANDD(Z|Z256)rm(b?)", + "VPEXPANDQ(Z|Z256)rm(b?)")>; + +def SKXWriteResGroup163 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup163], (instregex "VCVTSD2SSZrm")>; + +def SKXWriteResGroup164 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup164], (instregex "(V?)CVTDQ2PDrm")>; + +def SKXWriteResGroup166 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup166], (instregex "CVTPD2PSrm", + "CVT(T?)PD2DQrm", + "MMX_CVT(T?)PD2PIirm")>; + +def SKXWriteResGroup167 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup167], (instregex 
"VPCONFLICTQZ128rm(b?)")>; + +def SKXWriteResGroup169 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { + let Latency = 11; + let NumMicroOps = 7; + let ResourceCycles = [2,3,2]; +} +def: InstRW<[SKXWriteResGroup169], (instregex "RCL(16|32|64)rCL", + "RCR(16|32|64)rCL")>; + +def SKXWriteResGroup170 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 11; + let NumMicroOps = 9; + let ResourceCycles = [1,5,1,2]; +} +def: InstRW<[SKXWriteResGroup170], (instregex "RCL8rCL")>; + +def SKXWriteResGroup171 : SchedWriteRes<[SKXPort06,SKXPort0156]> { + let Latency = 11; + let NumMicroOps = 11; + let ResourceCycles = [2,9]; +} +def: InstRW<[SKXWriteResGroup171], (instrs LOOPE, LOOPNE)>; + +def SKXWriteResGroup174 : SchedWriteRes<[SKXPort01]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQ(Z128|Z256)rr")>; + +def SKXWriteResGroup174z : SchedWriteRes<[SKXPort05]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKXWriteResGroup174z], (instregex "VPMULLQZrr")>; + +def SKXWriteResGroup175 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup175], (instregex "VPERMWZ128rm(b?)")>; + +def SKXWriteResGroup176 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup176], (instregex "VCVT(T?)SD2USIZrm(b?)", + "VCVT(T?)SS2USI64Zrm(b?)")>; + +def SKXWriteResGroup177 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup177], (instregex "VCVT(T?)PS2QQZrm(b?)", + "VCVT(T?)PS2UQQZrm(b?)")>; + +def SKXWriteResGroup179 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 12; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup179], (instregex "CVTTSS2SI64rm")>; + +def SKXWriteResGroup180 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup180], (instregex "(ADD|SUB|SUBR)_FI(16|32)m", + "VPERMWZ256rm(b?)", + "VPERMWZrm(b?)")>; + +def SKXWriteResGroup181 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup181], (instregex "VCVTDQ2PDYrm")>; + +def SKXWriteResGroup183 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup183], (instregex "VPERMI2W128rm(b?)", + "VPERMT2W128rm(b?)")>; + +def SKXWriteResGroup184 : SchedWriteRes<[SKXPort0,SKXFPDivider]> { + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1,3]; +} +def : SchedAlias<WriteFDiv64, SKXWriteResGroup184>; // TODO - convert to ZnWriteResFpuPair +def : SchedAlias<WriteFDiv64X, SKXWriteResGroup184>; // TODO - convert to ZnWriteResFpuPair + +def SKXWriteResGroup184_1 : SchedWriteRes<[SKXPort0,SKXFPDivider]> { + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1,5]; +} +def : SchedAlias<WriteFDiv64Y, SKXWriteResGroup184_1>; // TODO - convert to ZnWriteResFpuPair + +def SKXWriteResGroup187 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: 
InstRW<[SKXWriteResGroup187], (instregex "MUL_FI(16|32)m")>; + +def SKXWriteResGroup188 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2DQZrm(b?)", + "VCVTPD2PSZrm(b?)", + "VCVTPD2UDQZrm(b?)", + "VCVTQQ2PSZrm(b?)", + "VCVTTPD2DQZrm(b?)", + "VCVTTPD2UDQZrm(b?)", + "VCVTUQQ2PSZrm(b?)")>; + +def SKXWriteResGroup189 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup189], (instregex "VPERMI2W256rm(b?)", + "VPERMI2Wrm(b?)", + "VPERMT2W256rm(b?)", + "VPERMT2Wrm(b?)")>; + +def SKXWriteResGroup190 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [2,4,1,3]; +} +def: InstRW<[SKXWriteResGroup190], (instregex "RCR8rCL")>; + +def SKXWriteResGroup191 : SchedWriteRes<[SKXPort0]> { + let Latency = 15; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_(FPrST0|FST0r|FrST0)")>; + +def SKXWriteResGroup194 : SchedWriteRes<[SKXPort1,SKXPort5,SKXPort01,SKXPort23,SKXPort015]> { + let Latency = 15; + let NumMicroOps = 8; + let ResourceCycles = [1,2,2,1,2]; +} +def: InstRW<[SKXWriteResGroup194], (instregex "VPCONFLICTDZ128rm(b?)")>; + +def SKXWriteResGroup195 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 15; + let NumMicroOps = 10; + let ResourceCycles = [1,1,1,5,1,1]; +} +def: InstRW<[SKXWriteResGroup195], (instregex "RCL(8|16|32|64)mCL")>; + +def SKXWriteResGroup199 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 16; + let NumMicroOps = 14; + let ResourceCycles = [1,1,1,4,2,5]; +} +def: InstRW<[SKXWriteResGroup199], (instrs CMPXCHG8B)>; + +def SKXWriteResGroup200 : SchedWriteRes<[SKXPort0156]> { + let Latency = 16; + let NumMicroOps = 16; + let ResourceCycles = [16]; +} +def: InstRW<[SKXWriteResGroup200], (instrs VZEROALL)>; + +def SKXWriteResGroup201 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> { + let Latency = 17; + let NumMicroOps = 2; + let ResourceCycles = [1,1,5]; +} +def : SchedAlias<WriteFDivXLd, SKXWriteResGroup201>; // TODO - convert to ZnWriteResFpuPair + +def SKXWriteResGroup202 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156]> { + let Latency = 17; + let NumMicroOps = 15; + let ResourceCycles = [2,1,2,4,2,4]; +} +def: InstRW<[SKXWriteResGroup202], (instrs XCH_F)>; + +def SKXWriteResGroup205 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 18; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKXWriteResGroup205], (instregex "VPMULLQZ128rm(b?)")>; + +def SKXWriteResGroup207 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort06,SKXPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,5]; +} +def: InstRW<[SKXWriteResGroup207], (instrs CPUID, RDTSC)>; + +def SKXWriteResGroup208 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 18; + let NumMicroOps = 11; + let ResourceCycles = [2,1,1,4,1,2]; +} +def: InstRW<[SKXWriteResGroup208], (instregex "RCR(8|16|32|64)mCL")>; + +def SKXWriteResGroup209 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> { + let Latency = 19; + let NumMicroOps = 2; + let ResourceCycles = [1,1,4]; +} +def : SchedAlias<WriteFDiv64Ld, SKXWriteResGroup209>; // TODO - convert to 
ZnWriteResFpuPair + +def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 19; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)", + "VPMULLQZrm(b?)")>; + +def SKXWriteResGroup214 : SchedWriteRes<[]> { + let Latency = 20; + let NumMicroOps = 0; +} +def: InstRW<[SKXWriteResGroup214], (instrs VGATHERDPSZ128rm, + VGATHERQPSZrm, + VPGATHERDDZ128rm)>; + +def SKXWriteResGroup215 : SchedWriteRes<[SKXPort0]> { + let Latency = 20; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup215], (instregex "DIV_(FPrST0|FST0r|FrST0)")>; + +def SKXWriteResGroup216 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> { + let Latency = 20; + let NumMicroOps = 2; + let ResourceCycles = [1,1,4]; +} +def : SchedAlias<WriteFDiv64XLd, SKXWriteResGroup216>; // TODO - convert to ZnWriteResFpuPair + +def SKXWriteResGroup218 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 20; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup218], (instrs VGATHERQPSZ128rm, + VGATHERQPSZ256rm, + VPGATHERQDZ128rm, + VPGATHERQDZ256rm)>; + +def SKXWriteResGroup219 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 20; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,1,2]; +} +def: InstRW<[SKXWriteResGroup219], (instrs INSB, INSL, INSW)>; + +def SKXWriteResGroup220 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort0156]> { + let Latency = 20; + let NumMicroOps = 10; + let ResourceCycles = [1,2,7]; +} +def: InstRW<[SKXWriteResGroup220], (instrs MWAITrr)>; + +def SKXWriteResGroup222 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> { + let Latency = 21; + let NumMicroOps = 2; + let ResourceCycles = [1,1,8]; +} +def : SchedAlias<WriteFDiv64YLd, SKXWriteResGroup222>; // TODO - convert to ZnWriteResFpuPair + +def SKXWriteResGroup223 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 22; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F(32|64)m")>; + +def SKXWriteResGroup224 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 22; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup224], (instrs VGATHERDPDZ128rm, + VGATHERQPDZ128rm, + VPGATHERDQZ128rm, + VPGATHERQQZ128rm)>; + +def SKXWriteResGroup224_2 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> { + let Latency = 22; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup224_2], (instrs VGATHERDPSrm, + VGATHERDPDrm, + VGATHERQPDrm, + VGATHERQPSrm, + VPGATHERDDrm, + VPGATHERDQrm, + VPGATHERQDrm, + VPGATHERQQrm, + VPGATHERDDrm, + VPGATHERQDrm, + VPGATHERDQrm, + VPGATHERQQrm, + VGATHERDPSrm, + VGATHERQPSrm, + VGATHERDPDrm, + VGATHERQPDrm)>; + +def SKXWriteResGroup224_3 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> { + let Latency = 25; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup224_3], (instrs VGATHERDPSYrm, + VGATHERQPDYrm, + VGATHERQPSYrm, + VPGATHERDDYrm, + VPGATHERDQYrm, + VPGATHERQDYrm, + VPGATHERQQYrm, + VPGATHERDDYrm, + VPGATHERQDYrm, + VPGATHERDQYrm, + VPGATHERQQYrm, + VGATHERDPSYrm, + VGATHERQPSYrm, + VGATHERDPDYrm)>; + +def SKXWriteResGroup225 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> { + let Latency = 22; + let NumMicroOps = 14; + let ResourceCycles = [5,5,4]; +} +def: 
InstRW<[SKXWriteResGroup225], (instregex "VPCONFLICTDZ128rr", + "VPCONFLICTQZ256rr")>; + +def SKXWriteResGroup228 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 23; + let NumMicroOps = 19; + let ResourceCycles = [2,1,4,1,1,4,6]; +} +def: InstRW<[SKXWriteResGroup228], (instrs CMPXCHG16B)>; + +def SKXWriteResGroup233 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 25; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI(16|32)m")>; + +def SKXWriteResGroup234 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 25; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup234], (instrs VGATHERDPDZ256rm, + VGATHERQPDZ256rm, + VPGATHERDQZ256rm, + VPGATHERQDZrm, + VPGATHERQQZ256rm)>; + +def SKXWriteResGroup238 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 26; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup238], (instrs VGATHERDPDZrm, + VGATHERQPDZrm, + VPGATHERDQZrm, + VPGATHERQQZrm)>; + +def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 27; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F(32|64)m")>; + +def SKXWriteResGroup240 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 27; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup240], (instrs VGATHERDPSZ256rm, + VPGATHERDDZ256rm)>; + +def SKXWriteResGroup241 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort0156]> { + let Latency = 28; + let NumMicroOps = 8; + let ResourceCycles = [2,4,1,1]; +} +def: InstRW<[SKXWriteResGroup241], (instregex "IDIV(8|16|32|64)m")>; + +def SKXWriteResGroup242 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> { + let Latency = 29; + let NumMicroOps = 15; + let ResourceCycles = [5,5,1,4]; +} +def: InstRW<[SKXWriteResGroup242], (instregex "VPCONFLICTQZ256rm(b?)")>; + +def SKXWriteResGroup243 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 30; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI(16|32)m")>; + +def SKXWriteResGroup245 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 30; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup245], (instrs VGATHERDPSZrm, + VPGATHERDDZrm)>; + +def SKXWriteResGroup247 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort06,SKXPort0156]> { + let Latency = 35; + let NumMicroOps = 23; + let ResourceCycles = [1,5,3,4,10]; +} +def: InstRW<[SKXWriteResGroup247], (instregex "IN(8|16|32)ri", + "IN(8|16|32)rr")>; + +def SKXWriteResGroup248 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 35; + let NumMicroOps = 23; + let ResourceCycles = [1,5,2,1,4,10]; +} +def: InstRW<[SKXWriteResGroup248], (instregex "OUT(8|16|32)ir", + "OUT(8|16|32)rr")>; + +def SKXWriteResGroup249 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> { + let Latency = 37; + let NumMicroOps = 21; + let ResourceCycles = [9,7,5]; +} +def: InstRW<[SKXWriteResGroup249], (instregex "VPCONFLICTDZ256rr", + "VPCONFLICTQZrr")>; + +def SKXWriteResGroup250 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> { + let Latency = 37; + let NumMicroOps = 31; + let ResourceCycles = [1,8,1,21]; +} +def: 
InstRW<[SKXWriteResGroup250], (instregex "XRSTOR(64)?")>; + +def SKXWriteResGroup252 : SchedWriteRes<[SKXPort1,SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort15,SKXPort0156]> { + let Latency = 40; + let NumMicroOps = 18; + let ResourceCycles = [1,1,2,3,1,1,1,8]; +} +def: InstRW<[SKXWriteResGroup252], (instrs VMCLEARm)>; + +def SKXWriteResGroup253 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> { + let Latency = 41; + let NumMicroOps = 39; + let ResourceCycles = [1,10,1,1,26]; +} +def: InstRW<[SKXWriteResGroup253], (instrs XSAVE64)>; + +def SKXWriteResGroup254 : SchedWriteRes<[SKXPort5,SKXPort0156]> { + let Latency = 42; + let NumMicroOps = 22; + let ResourceCycles = [2,20]; +} +def: InstRW<[SKXWriteResGroup254], (instrs RDTSCP)>; + +def SKXWriteResGroup255 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> { + let Latency = 42; + let NumMicroOps = 40; + let ResourceCycles = [1,11,1,1,26]; +} +def: InstRW<[SKXWriteResGroup255], (instrs XSAVE)>; +def: InstRW<[SKXWriteResGroup255], (instregex "XSAVEC", "XSAVES", "XSAVEOPT")>; + +def SKXWriteResGroup256 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> { + let Latency = 44; + let NumMicroOps = 22; + let ResourceCycles = [9,7,1,5]; +} +def: InstRW<[SKXWriteResGroup256], (instregex "VPCONFLICTDZ256rm(b?)", + "VPCONFLICTQZrm(b?)")>; + +def SKXWriteResGroup258 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort05,SKXPort06,SKXPort0156]> { + let Latency = 62; + let NumMicroOps = 64; + let ResourceCycles = [2,8,5,10,39]; +} +def: InstRW<[SKXWriteResGroup258], (instrs FLDENVm)>; + +def SKXWriteResGroup259 : SchedWriteRes<[SKXPort0,SKXPort6,SKXPort23,SKXPort05,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 63; + let NumMicroOps = 88; + let ResourceCycles = [4,4,31,1,2,1,45]; +} +def: InstRW<[SKXWriteResGroup259], (instrs FXRSTOR64)>; + +def SKXWriteResGroup260 : SchedWriteRes<[SKXPort0,SKXPort6,SKXPort23,SKXPort05,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 63; + let NumMicroOps = 90; + let ResourceCycles = [4,2,33,1,2,1,47]; +} +def: InstRW<[SKXWriteResGroup260], (instrs FXRSTOR)>; + +def SKXWriteResGroup261 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> { + let Latency = 67; + let NumMicroOps = 35; + let ResourceCycles = [17,11,7]; +} +def: InstRW<[SKXWriteResGroup261], (instregex "VPCONFLICTDZrr")>; + +def SKXWriteResGroup262 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> { + let Latency = 74; + let NumMicroOps = 36; + let ResourceCycles = [17,11,1,7]; +} +def: InstRW<[SKXWriteResGroup262], (instregex "VPCONFLICTDZrm(b?)")>; + +def SKXWriteResGroup263 : SchedWriteRes<[SKXPort5,SKXPort05,SKXPort0156]> { + let Latency = 75; + let NumMicroOps = 15; + let ResourceCycles = [6,3,6]; +} +def: InstRW<[SKXWriteResGroup263], (instrs FNINIT)>; + +def SKXWriteResGroup264 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156]> { + let Latency = 76; + let NumMicroOps = 32; + let ResourceCycles = [7,2,8,3,1,11]; +} +def: InstRW<[SKXWriteResGroup264], (instregex "DIV(16|32|64)r")>; + +def SKXWriteResGroup265 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort06,SKXPort0156]> { + let Latency = 102; + let NumMicroOps = 66; + let ResourceCycles = [4,2,4,8,14,34]; +} +def: InstRW<[SKXWriteResGroup265], (instregex "IDIV(16|32|64)r")>; + +def SKXWriteResGroup266 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort4,SKXPort5,SKXPort6,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 106; + let NumMicroOps = 100; + let ResourceCycles = [9,1,11,16,1,11,21,30]; +} +def: 
InstRW<[SKXWriteResGroup266], (instrs FSTENVm)>; + +def SKXWriteResGroup267 : SchedWriteRes<[SKXPort6,SKXPort0156]> { + let Latency = 140; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKXWriteResGroup267], (instrs PAUSE)>; + +def: InstRW<[WriteZero], (instrs CLC)>; + +} // SchedModel diff --git a/capstone/suite/synctools/tablegen/X86/back/X86Schedule.td b/capstone/suite/synctools/tablegen/X86/back/X86Schedule.td new file mode 100644 index 000000000..ef9ce9470 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86Schedule.td @@ -0,0 +1,661 @@ +//===-- X86Schedule.td - X86 Scheduling Definitions --------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// InstrSchedModel annotations for out-of-order CPUs. + +// Instructions with folded loads need to read the memory operand immediately, +// but other register operands don't have to be read until the load is ready. +// These operands are marked with ReadAfterLd. +def ReadAfterLd : SchedRead; + +// Instructions with both a load and a store folded are modeled as a folded +// load + WriteRMW. +def WriteRMW : SchedWrite; + +// Helper to set SchedWrite ExePorts/Latency/ResourceCycles/NumMicroOps. +multiclass X86WriteRes<SchedWrite SchedRW, + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res, int UOps> { + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } +} + +// Most instructions can fold loads, so almost every SchedWrite comes in two +// variants: With and without a folded load. +// An X86FoldableSchedWrite holds a reference to the corresponding SchedWrite +// with a folded load. +class X86FoldableSchedWrite : SchedWrite { + // The SchedWrite to use when a load is folded into the instruction. + SchedWrite Folded; +} + +// Multiclass that produces a linked pair of SchedWrites. +multiclass X86SchedWritePair { + // Register-Memory operation. + def Ld : SchedWrite; + // Register-Register operation. + def NAME : X86FoldableSchedWrite { + let Folded = !cast<SchedWrite>(NAME#"Ld"); + } +} + +// Helpers to mark SchedWrites as unsupported. +multiclass X86WriteResUnsupported<SchedWrite SchedRW> { + let Unsupported = 1 in { + def : WriteRes<SchedRW, []>; + } +} +multiclass X86WriteResPairUnsupported<X86FoldableSchedWrite SchedRW> { + let Unsupported = 1 in { + def : WriteRes<SchedRW, []>; + def : WriteRes<SchedRW.Folded, []>; + } +} + +// Multiclass that wraps X86FoldableSchedWrite for each vector width. +class X86SchedWriteWidths<X86FoldableSchedWrite sScl, + X86FoldableSchedWrite s128, + X86FoldableSchedWrite s256, + X86FoldableSchedWrite s512> { + X86FoldableSchedWrite Scl = sScl; // Scalar float/double operations. + X86FoldableSchedWrite MMX = sScl; // MMX operations. + X86FoldableSchedWrite XMM = s128; // XMM operations. + X86FoldableSchedWrite YMM = s256; // YMM operations. + X86FoldableSchedWrite ZMM = s512; // ZMM operations. +} + +// Multiclass that wraps X86SchedWriteWidths for each fp vector type. +class X86SchedWriteSizes<X86SchedWriteWidths sPS, + X86SchedWriteWidths sPD> { + X86SchedWriteWidths PS = sPS; + X86SchedWriteWidths PD = sPD; +} + +// Multiclass that wraps move/load/store triple for a vector width. 
+class X86SchedWriteMoveLS<SchedWrite MoveRR, + SchedWrite LoadRM, + SchedWrite StoreMR> { + SchedWrite RR = MoveRR; + SchedWrite RM = LoadRM; + SchedWrite MR = StoreMR; +} + +// Multiclass that wraps X86SchedWriteMoveLS for each vector width. +class X86SchedWriteMoveLSWidths<X86SchedWriteMoveLS sScl, + X86SchedWriteMoveLS s128, + X86SchedWriteMoveLS s256, + X86SchedWriteMoveLS s512> { + X86SchedWriteMoveLS Scl = sScl; // Scalar float/double operations. + X86SchedWriteMoveLS MMX = sScl; // MMX operations. + X86SchedWriteMoveLS XMM = s128; // XMM operations. + X86SchedWriteMoveLS YMM = s256; // YMM operations. + X86SchedWriteMoveLS ZMM = s512; // ZMM operations. +} + +// Loads, stores, and moves, not folded with other operations. +def WriteLoad : SchedWrite; +def WriteStore : SchedWrite; +def WriteStoreNT : SchedWrite; +def WriteMove : SchedWrite; + +// Arithmetic. +defm WriteALU : X86SchedWritePair; // Simple integer ALU op. +defm WriteADC : X86SchedWritePair; // Integer ALU + flags op. +def WriteALURMW : WriteSequence<[WriteALULd, WriteStore]>; +def WriteADCRMW : WriteSequence<[WriteADCLd, WriteStore]>; +defm WriteIMul : X86SchedWritePair; // Integer multiplication. +defm WriteIMul64 : X86SchedWritePair; // Integer 64-bit multiplication. +def WriteIMulH : SchedWrite; // Integer multiplication, high part. +def WriteLEA : SchedWrite; // LEA instructions can't fold loads. + +def WriteBSWAP32 : SchedWrite; // Byte Order (Endianness) 32-bit Swap. +def WriteBSWAP64 : SchedWrite; // Byte Order (Endianness) 64-bit Swap. + +// Integer division. +defm WriteDiv8 : X86SchedWritePair; +defm WriteDiv16 : X86SchedWritePair; +defm WriteDiv32 : X86SchedWritePair; +defm WriteDiv64 : X86SchedWritePair; +defm WriteIDiv8 : X86SchedWritePair; +defm WriteIDiv16 : X86SchedWritePair; +defm WriteIDiv32 : X86SchedWritePair; +defm WriteIDiv64 : X86SchedWritePair; + +defm WriteBSF : X86SchedWritePair; // Bit scan forward. +defm WriteBSR : X86SchedWritePair; // Bit scan reverse. +defm WritePOPCNT : X86SchedWritePair; // Bit population count. +defm WriteLZCNT : X86SchedWritePair; // Leading zero count. +defm WriteTZCNT : X86SchedWritePair; // Trailing zero count. +defm WriteCMOV : X86SchedWritePair; // Conditional move. +defm WriteCMOV2 : X86SchedWritePair; // Conditional (CF + ZF flag) move. +def WriteFCMOV : SchedWrite; // X87 conditional move. +def WriteSETCC : SchedWrite; // Set register based on condition code. +def WriteSETCCStore : SchedWrite; +def WriteLAHFSAHF : SchedWrite; // Load/Store flags in AH. +def WriteBitTest : SchedWrite; // Bit Test - TODO add memory folding support + +// Integer shifts and rotates. +defm WriteShift : X86SchedWritePair; +// Double shift instructions. +def WriteSHDrri : SchedWrite; +def WriteSHDrrcl : SchedWrite; +def WriteSHDmri : SchedWrite; +def WriteSHDmrcl : SchedWrite; + +// BMI1 BEXTR, BMI2 BZHI +defm WriteBEXTR : X86SchedWritePair; +defm WriteBZHI : X86SchedWritePair; + +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +def WriteZero : SchedWrite; + +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +defm WriteJump : X86SchedWritePair; + +// Floating point. This covers both scalar and vector operations. 
+def WriteFLD0 : SchedWrite; +def WriteFLD1 : SchedWrite; +def WriteFLDC : SchedWrite; +def WriteFLoad : SchedWrite; +def WriteFLoadX : SchedWrite; +def WriteFLoadY : SchedWrite; +def WriteFMaskedLoad : SchedWrite; +def WriteFMaskedLoadY : SchedWrite; +def WriteFStore : SchedWrite; +def WriteFStoreX : SchedWrite; +def WriteFStoreY : SchedWrite; +def WriteFStoreNT : SchedWrite; +def WriteFStoreNTX : SchedWrite; +def WriteFStoreNTY : SchedWrite; +def WriteFMaskedStore : SchedWrite; +def WriteFMaskedStoreY : SchedWrite; +def WriteFMove : SchedWrite; +def WriteFMoveX : SchedWrite; +def WriteFMoveY : SchedWrite; + +defm WriteFAdd : X86SchedWritePair; // Floating point add/sub. +defm WriteFAddX : X86SchedWritePair; // Floating point add/sub (XMM). +defm WriteFAddY : X86SchedWritePair; // Floating point add/sub (YMM). +defm WriteFAddZ : X86SchedWritePair; // Floating point add/sub (ZMM). +defm WriteFAdd64 : X86SchedWritePair; // Floating point double add/sub. +defm WriteFAdd64X : X86SchedWritePair; // Floating point double add/sub (XMM). +defm WriteFAdd64Y : X86SchedWritePair; // Floating point double add/sub (YMM). +defm WriteFAdd64Z : X86SchedWritePair; // Floating point double add/sub (ZMM). +defm WriteFCmp : X86SchedWritePair; // Floating point compare. +defm WriteFCmpX : X86SchedWritePair; // Floating point compare (XMM). +defm WriteFCmpY : X86SchedWritePair; // Floating point compare (YMM). +defm WriteFCmpZ : X86SchedWritePair; // Floating point compare (ZMM). +defm WriteFCmp64 : X86SchedWritePair; // Floating point double compare. +defm WriteFCmp64X : X86SchedWritePair; // Floating point double compare (XMM). +defm WriteFCmp64Y : X86SchedWritePair; // Floating point double compare (YMM). +defm WriteFCmp64Z : X86SchedWritePair; // Floating point double compare (ZMM). +defm WriteFCom : X86SchedWritePair; // Floating point compare to flags. +defm WriteFMul : X86SchedWritePair; // Floating point multiplication. +defm WriteFMulX : X86SchedWritePair; // Floating point multiplication (XMM). +defm WriteFMulY : X86SchedWritePair; // Floating point multiplication (YMM). +defm WriteFMulZ : X86SchedWritePair; // Floating point multiplication (YMM). +defm WriteFMul64 : X86SchedWritePair; // Floating point double multiplication. +defm WriteFMul64X : X86SchedWritePair; // Floating point double multiplication (XMM). +defm WriteFMul64Y : X86SchedWritePair; // Floating point double multiplication (YMM). +defm WriteFMul64Z : X86SchedWritePair; // Floating point double multiplication (ZMM). +defm WriteFDiv : X86SchedWritePair; // Floating point division. +defm WriteFDivX : X86SchedWritePair; // Floating point division (XMM). +defm WriteFDivY : X86SchedWritePair; // Floating point division (YMM). +defm WriteFDivZ : X86SchedWritePair; // Floating point division (ZMM). +defm WriteFDiv64 : X86SchedWritePair; // Floating point double division. +defm WriteFDiv64X : X86SchedWritePair; // Floating point double division (XMM). +defm WriteFDiv64Y : X86SchedWritePair; // Floating point double division (YMM). +defm WriteFDiv64Z : X86SchedWritePair; // Floating point double division (ZMM). +defm WriteFSqrt : X86SchedWritePair; // Floating point square root. +defm WriteFSqrtX : X86SchedWritePair; // Floating point square root (XMM). +defm WriteFSqrtY : X86SchedWritePair; // Floating point square root (YMM). +defm WriteFSqrtZ : X86SchedWritePair; // Floating point square root (ZMM). +defm WriteFSqrt64 : X86SchedWritePair; // Floating point double square root. 
+defm WriteFSqrt64X : X86SchedWritePair; // Floating point double square root (XMM).
+defm WriteFSqrt64Y : X86SchedWritePair; // Floating point double square root (YMM).
+defm WriteFSqrt64Z : X86SchedWritePair; // Floating point double square root (ZMM).
+defm WriteFSqrt80 : X86SchedWritePair; // Floating point long double square root.
+defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
+defm WriteFRcpX : X86SchedWritePair; // Floating point reciprocal estimate (XMM).
+defm WriteFRcpY : X86SchedWritePair; // Floating point reciprocal estimate (YMM).
+defm WriteFRcpZ : X86SchedWritePair; // Floating point reciprocal estimate (ZMM).
+defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
+defm WriteFRsqrtX: X86SchedWritePair; // Floating point reciprocal square root estimate (XMM).
+defm WriteFRsqrtY: X86SchedWritePair; // Floating point reciprocal square root estimate (YMM).
+defm WriteFRsqrtZ: X86SchedWritePair; // Floating point reciprocal square root estimate (ZMM).
+defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFMAX : X86SchedWritePair; // Fused Multiply Add (XMM).
+defm WriteFMAY : X86SchedWritePair; // Fused Multiply Add (YMM).
+defm WriteFMAZ : X86SchedWritePair; // Fused Multiply Add (ZMM).
+defm WriteDPPD : X86SchedWritePair; // Floating point double dot product.
+defm WriteDPPS : X86SchedWritePair; // Floating point single dot product.
+defm WriteDPPSY : X86SchedWritePair; // Floating point single dot product (YMM).
+defm WriteDPPSZ : X86SchedWritePair; // Floating point single dot product (ZMM).
+defm WriteFSign : X86SchedWritePair; // Floating point fabs/fchs.
+defm WriteFRnd : X86SchedWritePair; // Floating point rounding.
+defm WriteFRndY : X86SchedWritePair; // Floating point rounding (YMM).
+defm WriteFRndZ : X86SchedWritePair; // Floating point rounding (ZMM).
+defm WriteFLogic : X86SchedWritePair; // Floating point and/or/xor logicals.
+defm WriteFLogicY : X86SchedWritePair; // Floating point and/or/xor logicals (YMM).
+defm WriteFLogicZ : X86SchedWritePair; // Floating point and/or/xor logicals (ZMM).
+defm WriteFTest : X86SchedWritePair; // Floating point TEST instructions.
+defm WriteFTestY : X86SchedWritePair; // Floating point TEST instructions (YMM).
+defm WriteFTestZ : X86SchedWritePair; // Floating point TEST instructions (ZMM).
+defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
+defm WriteFShuffleY : X86SchedWritePair; // Floating point vector shuffles (YMM).
+defm WriteFShuffleZ : X86SchedWritePair; // Floating point vector shuffles (ZMM).
+defm WriteFVarShuffle : X86SchedWritePair; // Floating point vector variable shuffles.
+defm WriteFVarShuffleY : X86SchedWritePair; // Floating point vector variable shuffles (YMM).
+defm WriteFVarShuffleZ : X86SchedWritePair; // Floating point vector variable shuffles (ZMM).
+defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
+defm WriteFBlendY : X86SchedWritePair; // Floating point vector blends (YMM).
+defm WriteFBlendZ : X86SchedWritePair; // Floating point vector blends (ZMM).
+defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
+defm WriteFVarBlendY : X86SchedWritePair; // Fp vector variable blends (YMM).
+defm WriteFVarBlendZ : X86SchedWritePair; // Fp vector variable blends (ZMM).
+
+// FMA Scheduling helper class.
+class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } + +// Horizontal Add/Sub (float and integer) +defm WriteFHAdd : X86SchedWritePair; +defm WriteFHAddY : X86SchedWritePair; +defm WriteFHAddZ : X86SchedWritePair; +defm WritePHAdd : X86SchedWritePair; +defm WritePHAddX : X86SchedWritePair; +defm WritePHAddY : X86SchedWritePair; +defm WritePHAddZ : X86SchedWritePair; + +// Vector integer operations. +def WriteVecLoad : SchedWrite; +def WriteVecLoadX : SchedWrite; +def WriteVecLoadY : SchedWrite; +def WriteVecLoadNT : SchedWrite; +def WriteVecLoadNTY : SchedWrite; +def WriteVecMaskedLoad : SchedWrite; +def WriteVecMaskedLoadY : SchedWrite; +def WriteVecStore : SchedWrite; +def WriteVecStoreX : SchedWrite; +def WriteVecStoreY : SchedWrite; +def WriteVecStoreNT : SchedWrite; +def WriteVecStoreNTY : SchedWrite; +def WriteVecMaskedStore : SchedWrite; +def WriteVecMaskedStoreY : SchedWrite; +def WriteVecMove : SchedWrite; +def WriteVecMoveX : SchedWrite; +def WriteVecMoveY : SchedWrite; +def WriteVecMoveToGpr : SchedWrite; +def WriteVecMoveFromGpr : SchedWrite; + +defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals. +defm WriteVecALUX : X86SchedWritePair; // Vector integer ALU op, no logicals (XMM). +defm WriteVecALUY : X86SchedWritePair; // Vector integer ALU op, no logicals (YMM). +defm WriteVecALUZ : X86SchedWritePair; // Vector integer ALU op, no logicals (ZMM). +defm WriteVecLogic : X86SchedWritePair; // Vector integer and/or/xor logicals. +defm WriteVecLogicX : X86SchedWritePair; // Vector integer and/or/xor logicals (XMM). +defm WriteVecLogicY : X86SchedWritePair; // Vector integer and/or/xor logicals (YMM). +defm WriteVecLogicZ : X86SchedWritePair; // Vector integer and/or/xor logicals (ZMM). +defm WriteVecTest : X86SchedWritePair; // Vector integer TEST instructions. +defm WriteVecTestY : X86SchedWritePair; // Vector integer TEST instructions (YMM). +defm WriteVecTestZ : X86SchedWritePair; // Vector integer TEST instructions (ZMM). +defm WriteVecShift : X86SchedWritePair; // Vector integer shifts (default). +defm WriteVecShiftX : X86SchedWritePair; // Vector integer shifts (XMM). +defm WriteVecShiftY : X86SchedWritePair; // Vector integer shifts (YMM). +defm WriteVecShiftZ : X86SchedWritePair; // Vector integer shifts (ZMM). +defm WriteVecShiftImm : X86SchedWritePair; // Vector integer immediate shifts (default). +defm WriteVecShiftImmX: X86SchedWritePair; // Vector integer immediate shifts (XMM). +defm WriteVecShiftImmY: X86SchedWritePair; // Vector integer immediate shifts (YMM). +defm WriteVecShiftImmZ: X86SchedWritePair; // Vector integer immediate shifts (ZMM). +defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply (default). +defm WriteVecIMulX : X86SchedWritePair; // Vector integer multiply (XMM). +defm WriteVecIMulY : X86SchedWritePair; // Vector integer multiply (YMM). +defm WriteVecIMulZ : X86SchedWritePair; // Vector integer multiply (ZMM). +defm WritePMULLD : X86SchedWritePair; // Vector PMULLD. +defm WritePMULLDY : X86SchedWritePair; // Vector PMULLD (YMM). +defm WritePMULLDZ : X86SchedWritePair; // Vector PMULLD (ZMM). +defm WriteShuffle : X86SchedWritePair; // Vector shuffles. +defm WriteShuffleX : X86SchedWritePair; // Vector shuffles (XMM). +defm WriteShuffleY : X86SchedWritePair; // Vector shuffles (YMM). +defm WriteShuffleZ : X86SchedWritePair; // Vector shuffles (ZMM). +defm WriteVarShuffle : X86SchedWritePair; // Vector variable shuffles. +defm WriteVarShuffleX : X86SchedWritePair; // Vector variable shuffles (XMM). 
+defm WriteVarShuffleY : X86SchedWritePair; // Vector variable shuffles (YMM). +defm WriteVarShuffleZ : X86SchedWritePair; // Vector variable shuffles (ZMM). +defm WriteBlend : X86SchedWritePair; // Vector blends. +defm WriteBlendY : X86SchedWritePair; // Vector blends (YMM). +defm WriteBlendZ : X86SchedWritePair; // Vector blends (ZMM). +defm WriteVarBlend : X86SchedWritePair; // Vector variable blends. +defm WriteVarBlendY : X86SchedWritePair; // Vector variable blends (YMM). +defm WriteVarBlendZ : X86SchedWritePair; // Vector variable blends (ZMM). +defm WritePSADBW : X86SchedWritePair; // Vector PSADBW. +defm WritePSADBWX : X86SchedWritePair; // Vector PSADBW (XMM). +defm WritePSADBWY : X86SchedWritePair; // Vector PSADBW (YMM). +defm WritePSADBWZ : X86SchedWritePair; // Vector PSADBW (ZMM). +defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD. +defm WriteMPSADY : X86SchedWritePair; // Vector MPSAD (YMM). +defm WriteMPSADZ : X86SchedWritePair; // Vector MPSAD (ZMM). +defm WritePHMINPOS : X86SchedWritePair; // Vector PHMINPOS. + +// Vector insert/extract operations. +defm WriteVecInsert : X86SchedWritePair; // Insert gpr to vector element. +def WriteVecExtract : SchedWrite; // Extract vector element to gpr. +def WriteVecExtractSt : SchedWrite; // Extract vector element and store. + +// MOVMSK operations. +def WriteFMOVMSK : SchedWrite; +def WriteVecMOVMSK : SchedWrite; +def WriteVecMOVMSKY : SchedWrite; +def WriteMMXMOVMSK : SchedWrite; + +// Conversion between integer and float. +defm WriteCvtSD2I : X86SchedWritePair; // Double -> Integer. +defm WriteCvtPD2I : X86SchedWritePair; // Double -> Integer (XMM). +defm WriteCvtPD2IY : X86SchedWritePair; // Double -> Integer (YMM). +defm WriteCvtPD2IZ : X86SchedWritePair; // Double -> Integer (ZMM). + +defm WriteCvtSS2I : X86SchedWritePair; // Float -> Integer. +defm WriteCvtPS2I : X86SchedWritePair; // Float -> Integer (XMM). +defm WriteCvtPS2IY : X86SchedWritePair; // Float -> Integer (YMM). +defm WriteCvtPS2IZ : X86SchedWritePair; // Float -> Integer (ZMM). + +defm WriteCvtI2SD : X86SchedWritePair; // Integer -> Double. +defm WriteCvtI2PD : X86SchedWritePair; // Integer -> Double (XMM). +defm WriteCvtI2PDY : X86SchedWritePair; // Integer -> Double (YMM). +defm WriteCvtI2PDZ : X86SchedWritePair; // Integer -> Double (ZMM). + +defm WriteCvtI2SS : X86SchedWritePair; // Integer -> Float. +defm WriteCvtI2PS : X86SchedWritePair; // Integer -> Float (XMM). +defm WriteCvtI2PSY : X86SchedWritePair; // Integer -> Float (YMM). +defm WriteCvtI2PSZ : X86SchedWritePair; // Integer -> Float (ZMM). + +defm WriteCvtSS2SD : X86SchedWritePair; // Float -> Double size conversion. +defm WriteCvtPS2PD : X86SchedWritePair; // Float -> Double size conversion (XMM). +defm WriteCvtPS2PDY : X86SchedWritePair; // Float -> Double size conversion (YMM). +defm WriteCvtPS2PDZ : X86SchedWritePair; // Float -> Double size conversion (ZMM). + +defm WriteCvtSD2SS : X86SchedWritePair; // Double -> Float size conversion. +defm WriteCvtPD2PS : X86SchedWritePair; // Double -> Float size conversion (XMM). +defm WriteCvtPD2PSY : X86SchedWritePair; // Double -> Float size conversion (YMM). +defm WriteCvtPD2PSZ : X86SchedWritePair; // Double -> Float size conversion (ZMM). + +defm WriteCvtPH2PS : X86SchedWritePair; // Half -> Float size conversion. +defm WriteCvtPH2PSY : X86SchedWritePair; // Half -> Float size conversion (YMM). +defm WriteCvtPH2PSZ : X86SchedWritePair; // Half -> Float size conversion (ZMM). + +def WriteCvtPS2PH : SchedWrite; // // Float -> Half size conversion. 
+def WriteCvtPS2PHY : SchedWrite; // // Float -> Half size conversion (YMM).
+def WriteCvtPS2PHZ : SchedWrite; // // Float -> Half size conversion (ZMM).
+def WriteCvtPS2PHSt : SchedWrite; // // Float -> Half + store size conversion.
+def WriteCvtPS2PHYSt : SchedWrite; // // Float -> Half + store size conversion (YMM).
+def WriteCvtPS2PHZSt : SchedWrite; // // Float -> Half + store size conversion (ZMM).
+
+// CRC32 instruction.
+defm WriteCRC32 : X86SchedWritePair;
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+defm WritePCmpIStrM : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Mask
+defm WritePCmpEStrM : X86SchedWritePair;
+// Packed Compare Implicit Length Strings, Return Index
+defm WritePCmpIStrI : X86SchedWritePair;
+// Packed Compare Explicit Length Strings, Return Index
+defm WritePCmpEStrI : X86SchedWritePair;
+
+// AES instructions.
+defm WriteAESDecEnc : X86SchedWritePair; // Decryption, encryption.
+defm WriteAESIMC : X86SchedWritePair; // InvMixColumn.
+defm WriteAESKeyGen : X86SchedWritePair; // Key Generation.
+
+// Carry-less multiplication instructions.
+defm WriteCLMul : X86SchedWritePair;
+
+// EMMS/FEMMS
+def WriteEMMS : SchedWrite;
+
+// Load/store MXCSR
+def WriteLDMXCSR : SchedWrite;
+def WriteSTMXCSR : SchedWrite;
+
+// Catch-all for expensive system instructions.
+def WriteSystem : SchedWrite;
+
+// AVX2.
+defm WriteFShuffle256 : X86SchedWritePair; // Fp 256-bit width vector shuffles.
+defm WriteFVarShuffle256 : X86SchedWritePair; // Fp 256-bit width variable shuffles.
+defm WriteShuffle256 : X86SchedWritePair; // 256-bit width vector shuffles.
+defm WriteVarShuffle256 : X86SchedWritePair; // 256-bit width vector variable shuffles.
+defm WriteVarVecShift : X86SchedWritePair; // Variable vector shifts.
+defm WriteVarVecShiftY : X86SchedWritePair; // Variable vector shifts (YMM).
+defm WriteVarVecShiftZ : X86SchedWritePair; // Variable vector shifts (ZMM).
+
+// Old microcoded instructions that nobody uses.
+def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def WriteFence : SchedWrite;
+
+// Nop, not very useful except that it provides a model for nops!
+def WriteNop : SchedWrite;
+
+// Move/Load/Store wrappers.
+def WriteFMoveLS + : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStore>; +def WriteFMoveLSX + : X86SchedWriteMoveLS<WriteFMoveX, WriteFLoadX, WriteFStoreX>; +def WriteFMoveLSY + : X86SchedWriteMoveLS<WriteFMoveY, WriteFLoadY, WriteFStoreY>; +def SchedWriteFMoveLS + : X86SchedWriteMoveLSWidths<WriteFMoveLS, WriteFMoveLSX, + WriteFMoveLSY, WriteFMoveLSY>; + +def WriteFMoveLSNT + : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStoreNT>; +def WriteFMoveLSNTX + : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStoreNTX>; +def WriteFMoveLSNTY + : X86SchedWriteMoveLS<WriteFMoveY, WriteFLoadY, WriteFStoreNTY>; +def SchedWriteFMoveLSNT + : X86SchedWriteMoveLSWidths<WriteFMoveLSNT, WriteFMoveLSNTX, + WriteFMoveLSNTY, WriteFMoveLSNTY>; + +def WriteVecMoveLS + : X86SchedWriteMoveLS<WriteVecMove, WriteVecLoad, WriteVecStore>; +def WriteVecMoveLSX + : X86SchedWriteMoveLS<WriteVecMoveX, WriteVecLoadX, WriteVecStoreX>; +def WriteVecMoveLSY + : X86SchedWriteMoveLS<WriteVecMoveY, WriteVecLoadY, WriteVecStoreY>; +def SchedWriteVecMoveLS + : X86SchedWriteMoveLSWidths<WriteVecMoveLS, WriteVecMoveLSX, + WriteVecMoveLSY, WriteVecMoveLSY>; + +def WriteVecMoveLSNT + : X86SchedWriteMoveLS<WriteVecMove, WriteVecLoadNT, WriteVecStoreNT>; +def WriteVecMoveLSNTX + : X86SchedWriteMoveLS<WriteVecMoveX, WriteVecLoadNT, WriteVecStoreNT>; +def WriteVecMoveLSNTY + : X86SchedWriteMoveLS<WriteVecMoveY, WriteVecLoadNTY, WriteVecStoreNTY>; +def SchedWriteVecMoveLSNT + : X86SchedWriteMoveLSWidths<WriteVecMoveLSNT, WriteVecMoveLSNTX, + WriteVecMoveLSNTY, WriteVecMoveLSNTY>; + +// Vector width wrappers. +def SchedWriteFAdd + : X86SchedWriteWidths<WriteFAdd, WriteFAddX, WriteFAddY, WriteFAddZ>; +def SchedWriteFAdd64 + : X86SchedWriteWidths<WriteFAdd64, WriteFAdd64X, WriteFAdd64Y, WriteFAdd64Z>; +def SchedWriteFHAdd + : X86SchedWriteWidths<WriteFHAdd, WriteFHAdd, WriteFHAddY, WriteFHAddZ>; +def SchedWriteFCmp + : X86SchedWriteWidths<WriteFCmp, WriteFCmpX, WriteFCmpY, WriteFCmpZ>; +def SchedWriteFCmp64 + : X86SchedWriteWidths<WriteFCmp64, WriteFCmp64X, WriteFCmp64Y, WriteFCmp64Z>; +def SchedWriteFMul + : X86SchedWriteWidths<WriteFMul, WriteFMulX, WriteFMulY, WriteFMulZ>; +def SchedWriteFMul64 + : X86SchedWriteWidths<WriteFMul64, WriteFMul64X, WriteFMul64Y, WriteFMul64Z>; +def SchedWriteFMA + : X86SchedWriteWidths<WriteFMA, WriteFMAX, WriteFMAY, WriteFMAZ>; +def SchedWriteDPPD + : X86SchedWriteWidths<WriteDPPD, WriteDPPD, WriteDPPD, WriteDPPD>; +def SchedWriteDPPS + : X86SchedWriteWidths<WriteDPPS, WriteDPPS, WriteDPPSY, WriteDPPSZ>; +def SchedWriteFDiv + : X86SchedWriteWidths<WriteFDiv, WriteFDivX, WriteFDivY, WriteFDivZ>; +def SchedWriteFDiv64 + : X86SchedWriteWidths<WriteFDiv64, WriteFDiv64X, WriteFDiv64Y, WriteFDiv64Z>; +def SchedWriteFSqrt + : X86SchedWriteWidths<WriteFSqrt, WriteFSqrtX, + WriteFSqrtY, WriteFSqrtZ>; +def SchedWriteFSqrt64 + : X86SchedWriteWidths<WriteFSqrt64, WriteFSqrt64X, + WriteFSqrt64Y, WriteFSqrt64Z>; +def SchedWriteFRcp + : X86SchedWriteWidths<WriteFRcp, WriteFRcpX, WriteFRcpY, WriteFRcpZ>; +def SchedWriteFRsqrt + : X86SchedWriteWidths<WriteFRsqrt, WriteFRsqrtX, WriteFRsqrtY, WriteFRsqrtZ>; +def SchedWriteFRnd + : X86SchedWriteWidths<WriteFRnd, WriteFRnd, WriteFRndY, WriteFRndZ>; +def SchedWriteFLogic + : X86SchedWriteWidths<WriteFLogic, WriteFLogic, WriteFLogicY, WriteFLogicZ>; +def SchedWriteFTest + : X86SchedWriteWidths<WriteFTest, WriteFTest, WriteFTestY, WriteFTestZ>; + +def SchedWriteFShuffle + : X86SchedWriteWidths<WriteFShuffle, WriteFShuffle, + WriteFShuffleY, WriteFShuffleZ>; +def 
SchedWriteFVarShuffle + : X86SchedWriteWidths<WriteFVarShuffle, WriteFVarShuffle, + WriteFVarShuffleY, WriteFVarShuffleZ>; +def SchedWriteFBlend + : X86SchedWriteWidths<WriteFBlend, WriteFBlend, WriteFBlendY, WriteFBlendZ>; +def SchedWriteFVarBlend + : X86SchedWriteWidths<WriteFVarBlend, WriteFVarBlend, + WriteFVarBlendY, WriteFVarBlendZ>; + +def SchedWriteCvtDQ2PD + : X86SchedWriteWidths<WriteCvtI2SD, WriteCvtI2PD, + WriteCvtI2PDY, WriteCvtI2PDZ>; +def SchedWriteCvtDQ2PS + : X86SchedWriteWidths<WriteCvtI2SS, WriteCvtI2PS, + WriteCvtI2PSY, WriteCvtI2PSZ>; +def SchedWriteCvtPD2DQ + : X86SchedWriteWidths<WriteCvtSD2I, WriteCvtPD2I, + WriteCvtPD2IY, WriteCvtPD2IZ>; +def SchedWriteCvtPS2DQ + : X86SchedWriteWidths<WriteCvtSS2I, WriteCvtPS2I, + WriteCvtPS2IY, WriteCvtPS2IZ>; +def SchedWriteCvtPS2PD + : X86SchedWriteWidths<WriteCvtSS2SD, WriteCvtPS2PD, + WriteCvtPS2PDY, WriteCvtPS2PDZ>; +def SchedWriteCvtPD2PS + : X86SchedWriteWidths<WriteCvtSD2SS, WriteCvtPD2PS, + WriteCvtPD2PSY, WriteCvtPD2PSZ>; + +def SchedWriteVecALU + : X86SchedWriteWidths<WriteVecALU, WriteVecALUX, WriteVecALUY, WriteVecALUZ>; +def SchedWritePHAdd + : X86SchedWriteWidths<WritePHAdd, WritePHAddX, WritePHAddY, WritePHAddZ>; +def SchedWriteVecLogic + : X86SchedWriteWidths<WriteVecLogic, WriteVecLogicX, + WriteVecLogicY, WriteVecLogicZ>; +def SchedWriteVecTest + : X86SchedWriteWidths<WriteVecTest, WriteVecTest, + WriteVecTestY, WriteVecTestZ>; +def SchedWriteVecShift + : X86SchedWriteWidths<WriteVecShift, WriteVecShiftX, + WriteVecShiftY, WriteVecShiftZ>; +def SchedWriteVecShiftImm + : X86SchedWriteWidths<WriteVecShiftImm, WriteVecShiftImmX, + WriteVecShiftImmY, WriteVecShiftImmZ>; +def SchedWriteVarVecShift + : X86SchedWriteWidths<WriteVarVecShift, WriteVarVecShift, + WriteVarVecShiftY, WriteVarVecShiftZ>; +def SchedWriteVecIMul + : X86SchedWriteWidths<WriteVecIMul, WriteVecIMulX, + WriteVecIMulY, WriteVecIMulZ>; +def SchedWritePMULLD + : X86SchedWriteWidths<WritePMULLD, WritePMULLD, + WritePMULLDY, WritePMULLDZ>; +def SchedWriteMPSAD + : X86SchedWriteWidths<WriteMPSAD, WriteMPSAD, + WriteMPSADY, WriteMPSADZ>; +def SchedWritePSADBW + : X86SchedWriteWidths<WritePSADBW, WritePSADBWX, + WritePSADBWY, WritePSADBWZ>; + +def SchedWriteShuffle + : X86SchedWriteWidths<WriteShuffle, WriteShuffleX, + WriteShuffleY, WriteShuffleZ>; +def SchedWriteVarShuffle + : X86SchedWriteWidths<WriteVarShuffle, WriteVarShuffleX, + WriteVarShuffleY, WriteVarShuffleZ>; +def SchedWriteBlend + : X86SchedWriteWidths<WriteBlend, WriteBlend, WriteBlendY, WriteBlendZ>; +def SchedWriteVarBlend + : X86SchedWriteWidths<WriteVarBlend, WriteVarBlend, + WriteVarBlendY, WriteVarBlendZ>; + +// Vector size wrappers. +def SchedWriteFAddSizes + : X86SchedWriteSizes<SchedWriteFAdd, SchedWriteFAdd64>; +def SchedWriteFCmpSizes + : X86SchedWriteSizes<SchedWriteFCmp, SchedWriteFCmp64>; +def SchedWriteFMulSizes + : X86SchedWriteSizes<SchedWriteFMul, SchedWriteFMul64>; +def SchedWriteFDivSizes + : X86SchedWriteSizes<SchedWriteFDiv, SchedWriteFDiv64>; +def SchedWriteFSqrtSizes + : X86SchedWriteSizes<SchedWriteFSqrt, SchedWriteFSqrt64>; +def SchedWriteFLogicSizes + : X86SchedWriteSizes<SchedWriteFLogic, SchedWriteFLogic>; +def SchedWriteFShuffleSizes + : X86SchedWriteSizes<SchedWriteFShuffle, SchedWriteFShuffle>; + +//===----------------------------------------------------------------------===// +// Generic Processor Scheduler Models. + +// IssueWidth is analogous to the number of decode units. Core and its +// descendents, including Nehalem and SandyBridge have 4 decoders. 
+// Resources beyond the decoder operate on micro-ops and are buffered
+// so adjacent micro-ops don't directly compete.
+//
+// MicroOpBufferSize > 1 indicates that RAW dependencies can be
+// decoded in the same cycle. The value 32 is a reasonably arbitrary
+// number of in-flight instructions.
+//
+// HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
+// indicates high latency opcodes. Alternatively, InstrItinData
+// entries may be included here to define specific operand
+// latencies. Since these latencies are not used for pipeline hazards,
+// they do not need to be exact.
+//
+// The GenericX86Model contains no instruction schedules
+// and disables PostRAScheduler.
+class GenericX86Model : SchedMachineModel {
+  let IssueWidth = 4;
+  let MicroOpBufferSize = 32;
+  let LoadLatency = 4;
+  let HighLatency = 10;
+  let PostRAScheduler = 0;
+  let CompleteModel = 0;
+}
+
+def GenericModel : GenericX86Model;
+
+// Define a model with the PostRAScheduler enabled.
+def GenericPostRAModel : GenericX86Model {
+  let PostRAScheduler = 1;
+}
diff --git a/capstone/suite/synctools/tablegen/X86/back/X86ScheduleAtom.td b/capstone/suite/synctools/tablegen/X86/back/X86ScheduleAtom.td
new file mode 100644
index 000000000..a7f461c45
--- /dev/null
+++ b/capstone/suite/synctools/tablegen/X86/back/X86ScheduleAtom.td
@@ -0,0 +1,917 @@
+//===- X86ScheduleAtom.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the schedule class data for the Intel Atom
+// in-order (Saltwell-32nm/Bonnell-45nm) processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from the "Intel 64 and IA-32 Architectures
+// Optimization Reference Manual", Chapter 13, Section 4.
+
+// Atom machine model.
+def AtomModel : SchedMachineModel {
+  let IssueWidth = 2;  // Allows 2 instructions per scheduling group.
+  let MicroOpBufferSize = 0; // In-order execution, always hide latency.
+  let LoadLatency = 3; // Expected cycles, may be overridden.
+  let HighLatency = 30; // Expected, may be overridden.
+
+  // On the Atom, the throughput for taken branches is 2 cycles. For small
+  // simple loops, expand by a small factor to hide the backedge cost.
+  let LoopMicroOpBufferSize = 10;
+  let PostRAScheduler = 1;
+  let CompleteModel = 0;
+}
+
+let SchedModel = AtomModel in {
+
+// Functional Units
+def AtomPort0 : ProcResource<1>; // ALU: ALU0, shift/rotate, load/store
+                                 // SIMD/FP: SIMD ALU, Shuffle, SIMD/FP multiply, divide
+def AtomPort1 : ProcResource<1>; // ALU: ALU1, bit processing, jump, and LEA
+                                 // SIMD/FP: SIMD ALU, FP Adder
+
+def AtomPort01 : ProcResGroup<[AtomPort0, AtomPort1]>;
+
+// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when dispatched by the schedulers.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
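+//
+// For example, the WriteALU pair declared below,
+//   defm : AtomWriteResPair<WriteALU, [AtomPort01], [AtomPort0]>;
+// expands with the default RRLat/RMLat/RRRes/RMRes values to roughly:
+//   def : WriteRes<WriteALU,        [AtomPort01]> { let Latency = 1; let ResourceCycles = [1]; }
+//   def : WriteRes<WriteALU.Folded, [AtomPort0]>  { let Latency = 1; let ResourceCycles = [1]; }
+// where SchedRW.Folded names the load-folded variant of the write.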
+multiclass AtomWriteResPair<X86FoldableSchedWrite SchedRW,
+                            list<ProcResourceKind> RRPorts,
+                            list<ProcResourceKind> RMPorts,
+                            int RRLat = 1, int RMLat = 1,
+                            list<int> RRRes = [1],
+                            list<int> RMRes = [1]> {
+  // Register variant: uses RRPorts for RRRes cycles with latency RRLat.
+  def : WriteRes<SchedRW, RRPorts> {
+    let Latency = RRLat;
+    let ResourceCycles = RRRes;
+  }
+
+  // Memory (folded load) variant: uses RMPorts for RMRes cycles with
+  // latency RMLat.
+  def : WriteRes<SchedRW.Folded, RMPorts> {
+    let Latency = RMLat;
+    let ResourceCycles = RMRes;
+  }
+}
+
+// A folded store needs a cycle on Port0 for the store data.
+def : WriteRes<WriteRMW, [AtomPort0]>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Arithmetic.
+////////////////////////////////////////////////////////////////////////////////
+
+defm : AtomWriteResPair<WriteALU, [AtomPort01], [AtomPort0]>;
+defm : AtomWriteResPair<WriteADC, [AtomPort01], [AtomPort0]>;
+defm : AtomWriteResPair<WriteIMul, [AtomPort01], [AtomPort01], 7, 7, [7], [7]>;
+defm : AtomWriteResPair<WriteIMul64, [AtomPort01], [AtomPort01], 12, 12, [12], [12]>;
+
+defm : X86WriteRes<WriteBSWAP32, [AtomPort0], 1, [1], 1>;
+defm : X86WriteRes<WriteBSWAP64, [AtomPort0], 1, [1], 1>;
+
+defm : AtomWriteResPair<WriteDiv8, [AtomPort01], [AtomPort01], 50, 68, [50], [68]>;
+defm : AtomWriteResPair<WriteDiv16, [AtomPort01], [AtomPort01], 50, 50, [50], [50]>;
+defm : AtomWriteResPair<WriteDiv32, [AtomPort01], [AtomPort01], 50, 50, [50], [50]>;
+defm : AtomWriteResPair<WriteDiv64, [AtomPort01], [AtomPort01],130,130,[130],[130]>;
+defm : AtomWriteResPair<WriteIDiv8, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteIDiv16, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteIDiv32, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>;
+defm : AtomWriteResPair<WriteIDiv64, [AtomPort01], [AtomPort01],130,130,[130],[130]>;
+
+defm : X86WriteResPairUnsupported<WriteCRC32>;
+
+defm : AtomWriteResPair<WriteCMOV, [AtomPort01], [AtomPort0]>;
+defm : AtomWriteResPair<WriteCMOV2, [AtomPort01], [AtomPort0]>;
+defm : X86WriteRes<WriteFCMOV, [AtomPort01], 9, [9], 1>; // x87 conditional move.
+
+def : WriteRes<WriteSETCC, [AtomPort01]>;
+def : WriteRes<WriteSETCCStore, [AtomPort01]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteLAHFSAHF, [AtomPort01]> {
+  let Latency = 2;
+  let ResourceCycles = [2];
+}
+def : WriteRes<WriteBitTest,[AtomPort01]>;
+
+defm : X86WriteResUnsupported<WriteIMulH>;
+
+// This is for simple LEAs with one or two input operands.
+def : WriteRes<WriteLEA, [AtomPort1]>;
+
+def AtomWriteIMul16Ld : SchedWriteRes<[AtomPort01]> {
+  let Latency = 8;
+  let ResourceCycles = [8];
+}
+def : InstRW<[AtomWriteIMul16Ld], (instrs MUL16m, IMUL16m)>;
+
+def AtomWriteIMul32 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 6;
+  let ResourceCycles = [6];
+}
+def : InstRW<[AtomWriteIMul32], (instrs MUL32r, IMUL32r)>;
+
+def AtomWriteIMul64I : SchedWriteRes<[AtomPort01]> {
+  let Latency = 14;
+  let ResourceCycles = [14];
+}
+def : InstRW<[AtomWriteIMul64I], (instrs IMUL64rri8, IMUL64rri32,
+                                  IMUL64rmi8, IMUL64rmi32)>;
+
+// Bit counts.
+defm : AtomWriteResPair<WriteBSF, [AtomPort01], [AtomPort01], 16, 16, [16], [16]>; +defm : AtomWriteResPair<WriteBSR, [AtomPort01], [AtomPort01], 16, 16, [16], [16]>; +defm : X86WriteResPairUnsupported<WritePOPCNT>; +defm : X86WriteResPairUnsupported<WriteLZCNT>; +defm : X86WriteResPairUnsupported<WriteTZCNT>; + +// BMI1 BEXTR, BMI2 BZHI +defm : X86WriteResPairUnsupported<WriteBEXTR>; +defm : X86WriteResPairUnsupported<WriteBZHI>; + +//////////////////////////////////////////////////////////////////////////////// +// Integer shifts and rotates. +//////////////////////////////////////////////////////////////////////////////// + +defm : AtomWriteResPair<WriteShift, [AtomPort0], [AtomPort0]>; + +defm : X86WriteRes<WriteSHDrri, [AtomPort01], 2, [2], 1>; +defm : X86WriteRes<WriteSHDrrcl,[AtomPort01], 2, [2], 1>; +defm : X86WriteRes<WriteSHDmri, [AtomPort01], 4, [4], 1>; +defm : X86WriteRes<WriteSHDmrcl,[AtomPort01], 4, [4], 1>; + +//////////////////////////////////////////////////////////////////////////////// +// Loads, stores, and moves, not folded with other operations. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteLoad, [AtomPort0]>; +def : WriteRes<WriteStore, [AtomPort0]>; +def : WriteRes<WriteStoreNT, [AtomPort0]>; +def : WriteRes<WriteMove, [AtomPort01]>; + +// Treat misc copies as a move. +def : InstRW<[WriteMove], (instrs COPY)>; + +//////////////////////////////////////////////////////////////////////////////// +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteZero, []>; + +//////////////////////////////////////////////////////////////////////////////// +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +//////////////////////////////////////////////////////////////////////////////// + +defm : AtomWriteResPair<WriteJump, [AtomPort1], [AtomPort1]>; + +//////////////////////////////////////////////////////////////////////////////// +// Special case scheduling classes. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteSystem, [AtomPort01]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [AtomPort01]> { let Latency = 100; } +def : WriteRes<WriteFence, [AtomPort0]>; + +// Nops don't have dependencies, so there's no actual latency, but we set this +// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle. +def : WriteRes<WriteNop, [AtomPort01]>; + +//////////////////////////////////////////////////////////////////////////////// +// Floating point. This covers both scalar and vector operations. 
+//////////////////////////////////////////////////////////////////////////////// + +defm : X86WriteRes<WriteFLD0, [AtomPort01], 1, [1], 1>; +defm : X86WriteRes<WriteFLD1, [AtomPort01], 6, [6], 1>; +def : WriteRes<WriteFLoad, [AtomPort0]>; +def : WriteRes<WriteFLoadX, [AtomPort0]>; +defm : X86WriteResUnsupported<WriteFLoadY>; +defm : X86WriteResUnsupported<WriteFMaskedLoad>; +defm : X86WriteResUnsupported<WriteFMaskedLoadY>; + +def : WriteRes<WriteFStore, [AtomPort0]>; +def : WriteRes<WriteFStoreX, [AtomPort0]>; +defm : X86WriteResUnsupported<WriteFStoreY>; +def : WriteRes<WriteFStoreNT, [AtomPort0]>; +def : WriteRes<WriteFStoreNTX, [AtomPort0]>; +defm : X86WriteResUnsupported<WriteFStoreNTY>; +defm : X86WriteResUnsupported<WriteFMaskedStore>; +defm : X86WriteResUnsupported<WriteFMaskedStoreY>; + +def : WriteRes<WriteFMove, [AtomPort01]>; +def : WriteRes<WriteFMoveX, [AtomPort01]>; +defm : X86WriteResUnsupported<WriteFMoveY>; + +defm : X86WriteRes<WriteEMMS, [AtomPort01], 5, [5], 1>; + +defm : AtomWriteResPair<WriteFAdd, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : AtomWriteResPair<WriteFAddX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : X86WriteResPairUnsupported<WriteFAddY>; +defm : X86WriteResPairUnsupported<WriteFAddZ>; +defm : AtomWriteResPair<WriteFAdd64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : AtomWriteResPair<WriteFAdd64X, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>; +defm : X86WriteResPairUnsupported<WriteFAdd64Y>; +defm : X86WriteResPairUnsupported<WriteFAdd64Z>; +defm : AtomWriteResPair<WriteFCmp, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : AtomWriteResPair<WriteFCmpX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : X86WriteResPairUnsupported<WriteFCmpY>; +defm : X86WriteResPairUnsupported<WriteFCmpZ>; +defm : AtomWriteResPair<WriteFCmp64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : AtomWriteResPair<WriteFCmp64X, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>; +defm : X86WriteResPairUnsupported<WriteFCmp64Y>; +defm : X86WriteResPairUnsupported<WriteFCmp64Z>; +defm : AtomWriteResPair<WriteFCom, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : AtomWriteResPair<WriteFMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>; +defm : AtomWriteResPair<WriteFMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : X86WriteResPairUnsupported<WriteFMulY>; +defm : X86WriteResPairUnsupported<WriteFMulZ>; +defm : AtomWriteResPair<WriteFMul64, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : AtomWriteResPair<WriteFMul64X, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>; +defm : X86WriteResPairUnsupported<WriteFMul64Y>; +defm : X86WriteResPairUnsupported<WriteFMul64Z>; +defm : AtomWriteResPair<WriteFRcp, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>; +defm : AtomWriteResPair<WriteFRcpX, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>; +defm : X86WriteResPairUnsupported<WriteFRcpY>; +defm : X86WriteResPairUnsupported<WriteFRcpZ>; +defm : AtomWriteResPair<WriteFRsqrt, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>; +defm : AtomWriteResPair<WriteFRsqrtX, [AtomPort01], [AtomPort01], 9, 10, [9], [10]>; +defm : X86WriteResPairUnsupported<WriteFRsqrtY>; +defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; +defm : AtomWriteResPair<WriteFDiv, [AtomPort01], [AtomPort01], 34, 34, [34], [34]>; +defm : AtomWriteResPair<WriteFDivX, [AtomPort01], [AtomPort01], 70, 70, [70], [70]>; +defm : X86WriteResPairUnsupported<WriteFDivY>; +defm : X86WriteResPairUnsupported<WriteFDivZ>; +defm : AtomWriteResPair<WriteFDiv64, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>; +defm : 
AtomWriteResPair<WriteFDiv64X, [AtomPort01], [AtomPort01],125,125,[125],[125]>; +defm : X86WriteResPairUnsupported<WriteFDiv64Y>; +defm : X86WriteResPairUnsupported<WriteFDiv64Z>; +defm : AtomWriteResPair<WriteFSqrt, [AtomPort01], [AtomPort01], 34, 34, [34], [34]>; +defm : AtomWriteResPair<WriteFSqrtX, [AtomPort01], [AtomPort01], 70, 70, [70], [70]>; +defm : X86WriteResPairUnsupported<WriteFSqrtY>; +defm : X86WriteResPairUnsupported<WriteFSqrtZ>; +defm : AtomWriteResPair<WriteFSqrt64, [AtomPort01], [AtomPort01], 62, 62, [62], [62]>; +defm : AtomWriteResPair<WriteFSqrt64X, [AtomPort01], [AtomPort01],125,125,[125],[125]>; +defm : X86WriteResPairUnsupported<WriteFSqrt64Y>; +defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; +defm : AtomWriteResPair<WriteFSqrt80, [AtomPort01], [AtomPort01], 71, 71, [71], [71]>; +defm : AtomWriteResPair<WriteFSign, [AtomPort1], [AtomPort1]>; +defm : AtomWriteResPair<WriteFRnd, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : X86WriteResPairUnsupported<WriteFRndY>; +defm : X86WriteResPairUnsupported<WriteFRndZ>; +defm : AtomWriteResPair<WriteFLogic, [AtomPort01], [AtomPort0]>; +defm : X86WriteResPairUnsupported<WriteFLogicY>; +defm : X86WriteResPairUnsupported<WriteFLogicZ>; +defm : AtomWriteResPair<WriteFTest, [AtomPort01], [AtomPort0]>; +defm : X86WriteResPairUnsupported<WriteFTestY>; +defm : X86WriteResPairUnsupported<WriteFTestZ>; +defm : AtomWriteResPair<WriteFShuffle, [AtomPort0], [AtomPort0]>; +defm : X86WriteResPairUnsupported<WriteFShuffleY>; +defm : X86WriteResPairUnsupported<WriteFShuffleZ>; +defm : X86WriteResPairUnsupported<WriteFVarShuffle>; +defm : X86WriteResPairUnsupported<WriteFVarShuffleY>; +defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; +defm : X86WriteResPairUnsupported<WriteFMA>; +defm : X86WriteResPairUnsupported<WriteFMAX>; +defm : X86WriteResPairUnsupported<WriteFMAY>; +defm : X86WriteResPairUnsupported<WriteFMAZ>; +defm : X86WriteResPairUnsupported<WriteDPPD>; +defm : X86WriteResPairUnsupported<WriteDPPS>; +defm : X86WriteResPairUnsupported<WriteDPPSY>; +defm : X86WriteResPairUnsupported<WriteDPPSZ>; +defm : X86WriteResPairUnsupported<WriteFBlend>; +defm : X86WriteResPairUnsupported<WriteFBlendY>; +defm : X86WriteResPairUnsupported<WriteFBlendZ>; +defm : X86WriteResPairUnsupported<WriteFVarBlend>; +defm : X86WriteResPairUnsupported<WriteFVarBlendY>; +defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; +defm : X86WriteResPairUnsupported<WriteFShuffle256>; +defm : X86WriteResPairUnsupported<WriteFVarShuffle256>; + +//////////////////////////////////////////////////////////////////////////////// +// Conversions. 
+//////////////////////////////////////////////////////////////////////////////// + +defm : AtomWriteResPair<WriteCvtSS2I, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>; +defm : AtomWriteResPair<WriteCvtPS2I, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>; +defm : X86WriteResPairUnsupported<WriteCvtPS2IY>; +defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; +defm : AtomWriteResPair<WriteCvtSD2I, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>; +defm : AtomWriteResPair<WriteCvtPD2I, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>; +defm : X86WriteResPairUnsupported<WriteCvtPD2IY>; +defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; + +defm : AtomWriteResPair<WriteCvtI2SS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>; +defm : AtomWriteResPair<WriteCvtI2PS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>; +defm : X86WriteResPairUnsupported<WriteCvtI2PSY>; +defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; +defm : AtomWriteResPair<WriteCvtI2SD, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>; +defm : AtomWriteResPair<WriteCvtI2PD, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>; +defm : X86WriteResPairUnsupported<WriteCvtI2PDY>; +defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; + +defm : AtomWriteResPair<WriteCvtSS2SD, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>; +defm : AtomWriteResPair<WriteCvtPS2PD, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>; +defm : X86WriteResPairUnsupported<WriteCvtPS2PDY>; +defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; +defm : AtomWriteResPair<WriteCvtSD2SS, [AtomPort01], [AtomPort01], 6, 7, [6], [7]>; +defm : AtomWriteResPair<WriteCvtPD2PS, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>; +defm : X86WriteResPairUnsupported<WriteCvtPD2PSY>; +defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; + +defm : X86WriteResPairUnsupported<WriteCvtPH2PS>; +defm : X86WriteResPairUnsupported<WriteCvtPH2PSY>; +defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; +defm : X86WriteResUnsupported<WriteCvtPS2PH>; +defm : X86WriteResUnsupported<WriteCvtPS2PHSt>; +defm : X86WriteResUnsupported<WriteCvtPS2PHY>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; +defm : X86WriteResUnsupported<WriteCvtPS2PHYSt>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; + +//////////////////////////////////////////////////////////////////////////////// +// Vector integer operations. 
+//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteVecLoad, [AtomPort0]>; +def : WriteRes<WriteVecLoadX, [AtomPort0]>; +defm : X86WriteResUnsupported<WriteVecLoadY>; +def : WriteRes<WriteVecLoadNT, [AtomPort0]>; +defm : X86WriteResUnsupported<WriteVecLoadNTY>; +defm : X86WriteResUnsupported<WriteVecMaskedLoad>; +defm : X86WriteResUnsupported<WriteVecMaskedLoadY>; + +def : WriteRes<WriteVecStore, [AtomPort0]>; +def : WriteRes<WriteVecStoreX, [AtomPort0]>; +defm : X86WriteResUnsupported<WriteVecStoreY>; +def : WriteRes<WriteVecStoreNT, [AtomPort0]>; +defm : X86WriteResUnsupported<WriteVecStoreNTY>; +def : WriteRes<WriteVecMaskedStore, [AtomPort0]>; +defm : X86WriteResUnsupported<WriteVecMaskedStoreY>; + +def : WriteRes<WriteVecMove, [AtomPort0]>; +def : WriteRes<WriteVecMoveX, [AtomPort01]>; +defm : X86WriteResUnsupported<WriteVecMoveY>; +defm : X86WriteRes<WriteVecMoveToGpr, [AtomPort0], 3, [3], 1>; +defm : X86WriteRes<WriteVecMoveFromGpr, [AtomPort0], 1, [1], 1>; + +defm : AtomWriteResPair<WriteVecALU, [AtomPort01], [AtomPort0], 1, 1>; +defm : AtomWriteResPair<WriteVecALUX, [AtomPort01], [AtomPort0], 1, 1>; +defm : X86WriteResPairUnsupported<WriteVecALUY>; +defm : X86WriteResPairUnsupported<WriteVecALUZ>; +defm : AtomWriteResPair<WriteVecLogic, [AtomPort01], [AtomPort0], 1, 1>; +defm : AtomWriteResPair<WriteVecLogicX, [AtomPort01], [AtomPort0], 1, 1>; +defm : X86WriteResPairUnsupported<WriteVecLogicY>; +defm : X86WriteResPairUnsupported<WriteVecLogicZ>; +defm : AtomWriteResPair<WriteVecTest, [AtomPort01], [AtomPort0], 1, 1>; +defm : X86WriteResPairUnsupported<WriteVecTestY>; +defm : X86WriteResPairUnsupported<WriteVecTestZ>; +defm : AtomWriteResPair<WriteVecShift, [AtomPort01], [AtomPort01], 2, 3, [2], [3]>; +defm : AtomWriteResPair<WriteVecShiftX, [AtomPort01], [AtomPort01], 2, 3, [2], [3]>; +defm : X86WriteResPairUnsupported<WriteVecShiftY>; +defm : X86WriteResPairUnsupported<WriteVecShiftZ>; +defm : AtomWriteResPair<WriteVecShiftImm, [AtomPort01], [AtomPort01], 1, 1, [1], [1]>; +defm : AtomWriteResPair<WriteVecShiftImmX, [AtomPort01], [AtomPort01], 1, 1, [1], [1]>; +defm : X86WriteResPairUnsupported<WriteVecShiftImmY>; +defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; +defm : AtomWriteResPair<WriteVecIMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>; +defm : AtomWriteResPair<WriteVecIMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : X86WriteResPairUnsupported<WriteVecIMulY>; +defm : X86WriteResPairUnsupported<WriteVecIMulZ>; +defm : X86WriteResPairUnsupported<WritePMULLD>; +defm : X86WriteResPairUnsupported<WritePMULLDY>; +defm : X86WriteResPairUnsupported<WritePMULLDZ>; +defm : X86WriteResPairUnsupported<WritePHMINPOS>; +defm : X86WriteResPairUnsupported<WriteMPSAD>; +defm : X86WriteResPairUnsupported<WriteMPSADY>; +defm : X86WriteResPairUnsupported<WriteMPSADZ>; +defm : AtomWriteResPair<WritePSADBW, [AtomPort01], [AtomPort01], 4, 4, [4], [4]>; +defm : AtomWriteResPair<WritePSADBWX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : X86WriteResPairUnsupported<WritePSADBWY>; +defm : X86WriteResPairUnsupported<WritePSADBWZ>; +defm : AtomWriteResPair<WriteShuffle, [AtomPort0], [AtomPort0], 1, 1>; +defm : AtomWriteResPair<WriteShuffleX, [AtomPort0], [AtomPort0], 1, 1>; +defm : X86WriteResPairUnsupported<WriteShuffleY>; +defm : X86WriteResPairUnsupported<WriteShuffleZ>; +defm : AtomWriteResPair<WriteVarShuffle, [AtomPort0], [AtomPort0], 1, 1>; +defm : AtomWriteResPair<WriteVarShuffleX, [AtomPort01], [AtomPort01], 4, 5, [4], 
[5]>; +defm : X86WriteResPairUnsupported<WriteVarShuffleY>; +defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; +defm : X86WriteResPairUnsupported<WriteBlend>; +defm : X86WriteResPairUnsupported<WriteBlendY>; +defm : X86WriteResPairUnsupported<WriteBlendZ>; +defm : X86WriteResPairUnsupported<WriteVarBlend>; +defm : X86WriteResPairUnsupported<WriteVarBlendY>; +defm : X86WriteResPairUnsupported<WriteVarBlendZ>; +defm : X86WriteResPairUnsupported<WriteShuffle256>; +defm : X86WriteResPairUnsupported<WriteVarShuffle256>; +defm : X86WriteResPairUnsupported<WriteVarVecShift>; +defm : X86WriteResPairUnsupported<WriteVarVecShiftY>; +defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; + +//////////////////////////////////////////////////////////////////////////////// +// Vector insert/extract operations. +//////////////////////////////////////////////////////////////////////////////// + +defm : AtomWriteResPair<WriteVecInsert, [AtomPort0], [AtomPort0], 1, 1>; +def : WriteRes<WriteVecExtract, [AtomPort0]>; +def : WriteRes<WriteVecExtractSt, [AtomPort0]>; + +//////////////////////////////////////////////////////////////////////////////// +// SSE42 String instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : X86WriteResPairUnsupported<WritePCmpIStrI>; +defm : X86WriteResPairUnsupported<WritePCmpIStrM>; +defm : X86WriteResPairUnsupported<WritePCmpEStrI>; +defm : X86WriteResPairUnsupported<WritePCmpEStrM>; + +//////////////////////////////////////////////////////////////////////////////// +// MOVMSK Instructions. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteFMOVMSK, [AtomPort0]> { let Latency = 3; let ResourceCycles = [3]; } +def : WriteRes<WriteVecMOVMSK, [AtomPort0]> { let Latency = 3; let ResourceCycles = [3]; } +defm : X86WriteResUnsupported<WriteVecMOVMSKY>; +def : WriteRes<WriteMMXMOVMSK, [AtomPort0]> { let Latency = 3; let ResourceCycles = [3]; } + +//////////////////////////////////////////////////////////////////////////////// +// AES instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : X86WriteResPairUnsupported<WriteAESIMC>; +defm : X86WriteResPairUnsupported<WriteAESKeyGen>; +defm : X86WriteResPairUnsupported<WriteAESDecEnc>; + +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : AtomWriteResPair<WriteFHAdd, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>; +defm : AtomWriteResPair<WriteFHAddY, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>; +defm : AtomWriteResPair<WritePHAdd, [AtomPort01], [AtomPort01], 3, 4, [3], [4]>; +defm : AtomWriteResPair<WritePHAddX, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>; +defm : AtomWriteResPair<WritePHAddY, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>; + +//////////////////////////////////////////////////////////////////////////////// +// Carry-less multiplication instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : X86WriteResPairUnsupported<WriteCLMul>; + +//////////////////////////////////////////////////////////////////////////////// +// Load/store MXCSR. 
+//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteLDMXCSR, [AtomPort01]> { let Latency = 5; let ResourceCycles = [5]; } +def : WriteRes<WriteSTMXCSR, [AtomPort01]> { let Latency = 15; let ResourceCycles = [15]; } + +//////////////////////////////////////////////////////////////////////////////// +// Special Cases. +//////////////////////////////////////////////////////////////////////////////// + +// Port0 +def AtomWrite0_1 : SchedWriteRes<[AtomPort0]> { + let Latency = 1; + let ResourceCycles = [1]; +} +def : InstRW<[AtomWrite0_1], (instrs FXAM, LD_Frr, + MOVSX64rr32)>; +def : SchedAlias<WriteALURMW, AtomWrite0_1>; +def : SchedAlias<WriteADCRMW, AtomWrite0_1>; +def : InstRW<[AtomWrite0_1], (instregex "(RCL|RCR|ROL|ROR|SAR|SHL|SHR)(8|16|32|64)m", + "MOV(S|Z)X(32|64)rr(8|8_NOREX|16)")>; + +def AtomWrite0_5 : SchedWriteRes<[AtomPort0]> { + let Latency = 5; + let ResourceCycles = [5]; +} +def : InstRW<[AtomWrite0_5], (instregex "IMUL32(rm|rr)")>; + +// Port1 +def AtomWrite1_1 : SchedWriteRes<[AtomPort1]> { + let Latency = 1; + let ResourceCycles = [1]; +} +def : InstRW<[AtomWrite1_1], (instrs FCOMPP)>; +def : InstRW<[AtomWrite1_1], (instregex "UCOM_F(P|PP)?r", + "BT(C|R|S)?(16|32|64)(rr|ri8)")>; + +def AtomWrite1_5 : SchedWriteRes<[AtomPort1]> { + let Latency = 5; + let ResourceCycles = [5]; +} +def : InstRW<[AtomWrite1_5], (instrs MMX_CVTPI2PSirr, MMX_CVTPI2PSirm, + MMX_CVTPS2PIirr, MMX_CVTTPS2PIirr)>; + +// Port0 and Port1 +def AtomWrite0_1_1 : SchedWriteRes<[AtomPort0, AtomPort1]> { + let Latency = 1; + let ResourceCycles = [1, 1]; +} +def : InstRW<[AtomWrite0_1_1], (instrs POP32r, POP64r, + POP16rmr, POP32rmr, POP64rmr, + PUSH16r, PUSH32r, PUSH64r, + PUSHi16, PUSHi32, + PUSH16rmr, PUSH32rmr, PUSH64rmr, + PUSH16i8, PUSH32i8, PUSH64i8, PUSH64i32, + XCH_F)>; +def : InstRW<[AtomWrite0_1_1], (instregex "RETI(L|Q|W)$", + "IRET(16|32|64)?")>; + +def AtomWrite0_1_5 : SchedWriteRes<[AtomPort0, AtomPort1]> { + let Latency = 5; + let ResourceCycles = [5, 5]; +} +def : InstRW<[AtomWrite0_1_5], (instrs MMX_CVTPS2PIirm, MMX_CVTTPS2PIirm)>; +def : InstRW<[AtomWrite0_1_5], (instregex "ILD_F(16|32|64)")>; + +// Port0 or Port1 +def AtomWrite01_1 : SchedWriteRes<[AtomPort01]> { + let Latency = 1; + let ResourceCycles = [1]; +} +def : InstRW<[AtomWrite01_1], (instrs FDECSTP, FFREE, FFREEP, FINCSTP, WAIT, + LFENCE, + STOSB, STOSL, STOSQ, STOSW, + MOVSSrr, MOVSSrr_REV, + PSLLDQri, PSRLDQri)>; +def : InstRW<[AtomWrite01_1], (instregex "MMX_PACK(SSDW|SSWB|USWB)irr", + "MMX_PUNPCKH(BW|DQ|WD)irr")>; + +def AtomWrite01_2 : SchedWriteRes<[AtomPort01]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : InstRW<[AtomWrite01_2], (instrs LEAVE, LEAVE64, POP16r, + PUSH16rmm, PUSH32rmm, PUSH64rmm, + LODSB, LODSL, LODSQ, LODSW, + SCASB, SCASL, SCASQ, SCASW)>; +def : InstRW<[AtomWrite01_2], (instregex "BT(C|R|S)(16|32|64)mi8", + "PUSH(CS|DS|ES|FS|GS|SS)(16|32|64)", + "XADD(8|16|32|64)rr", + "XCHG(8|16|32|64)(ar|rr)", + "(ST|ISTT)_F(P)?(16|32|64)?(m|rr)", + "MMX_P(ADD|SUB)Qirr", + "MOV(S|Z)X16rr8", + "MOV(UPS|UPD|DQU)mr", + "MASKMOVDQU(64)?", + "P(ADD|SUB)Qrr")>; + +def AtomWrite01_3 : SchedWriteRes<[AtomPort01]> { + let Latency = 3; + let ResourceCycles = [3]; +} +def : InstRW<[AtomWrite01_3], (instrs CLD, LDDQUrm, + CMPSB, CMPSL, CMPSQ, CMPSW, + MOVSB, MOVSL, MOVSQ, MOVSW, + POP16rmm, POP32rmm, POP64rmm)>; +def : InstRW<[AtomWrite01_3], (instregex "XADD(8|16|32|64)rm", + "XCHG(8|16|32|64)rm", + "PH(ADD|SUB)Drr", + "MOV(S|Z)X16rm8", + "MMX_P(ADD|SUB)Qirm", + 
"MOV(UPS|UPD|DQU)rm", + "P(ADD|SUB)Qrm")>; + +def AtomWrite01_4 : SchedWriteRes<[AtomPort01]> { + let Latency = 4; + let ResourceCycles = [4]; +} +def : InstRW<[AtomWrite01_4], (instrs CBW, CWD, CWDE, CDQ, CDQE, CQO, + JCXZ, JECXZ, JRCXZ, + LD_F80m)>; +def : InstRW<[AtomWrite01_4], (instregex "PH(ADD|SUB)Drm", + "(MMX_)?PEXTRWrr(_REV)?")>; + +def AtomWrite01_5 : SchedWriteRes<[AtomPort01]> { + let Latency = 5; + let ResourceCycles = [5]; +} +def : InstRW<[AtomWrite01_5], (instrs FLDCW16m, ST_FP80m)>; +def : InstRW<[AtomWrite01_5], (instregex "MMX_PH(ADD|SUB)S?Wrr")>; + +def AtomWrite01_6 : SchedWriteRes<[AtomPort01]> { + let Latency = 6; + let ResourceCycles = [6]; +} +def : InstRW<[AtomWrite01_6], (instrs CMPXCHG8rm, INTO, XLAT, + SHLD16rrCL, SHRD16rrCL, + SHLD16rri8, SHRD16rri8, + SHLD16mrCL, SHRD16mrCL, + SHLD16mri8, SHRD16mri8)>; +def : InstRW<[AtomWrite01_6], (instregex "IMUL16rr", + "IST_F(P)?(16|32|64)?m", + "MMX_PH(ADD|SUB)S?Wrm")>; + +def AtomWrite01_7 : SchedWriteRes<[AtomPort01]> { + let Latency = 7; + let ResourceCycles = [7]; +} +def : InstRW<[AtomWrite01_7], (instrs AAD8i8)>; + +def AtomWrite01_8 : SchedWriteRes<[AtomPort01]> { + let Latency = 8; + let ResourceCycles = [8]; +} +def : InstRW<[AtomWrite01_8], (instrs LOOPE, + PUSHA16, PUSHA32, + SHLD64rrCL, SHRD64rrCL, + FNSTCW16m)>; + +def AtomWrite01_9 : SchedWriteRes<[AtomPort01]> { + let Latency = 9; + let ResourceCycles = [9]; +} +def : InstRW<[AtomWrite01_9], (instrs BT16mr, BT32mr, BT64mr, + POPA16, POPA32, + PUSHF16, PUSHF32, PUSHF64, + SHLD64mrCL, SHRD64mrCL, + SHLD64mri8, SHRD64mri8, + SHLD64rri8, SHRD64rri8, + CMPXCHG8rr)>; +def : InstRW<[AtomWrite01_9], (instregex "(U)?COM_FI", "TST_F", + "(U)?COMIS(D|S)rr", + "CVT(T)?SS2SI64rr(_Int)?")>; + +def AtomWrite01_10 : SchedWriteRes<[AtomPort01]> { + let Latency = 10; + let ResourceCycles = [10]; +} +def : SchedAlias<WriteFLDC, AtomWrite01_10>; +def : InstRW<[AtomWrite01_10], (instregex "(U)?COMIS(D|S)rm", + "CVT(T)?SS2SI64rm(_Int)?")>; + +def AtomWrite01_11 : SchedWriteRes<[AtomPort01]> { + let Latency = 11; + let ResourceCycles = [11]; +} +def : InstRW<[AtomWrite01_11], (instrs BOUNDS16rm, BOUNDS32rm)>; +def : InstRW<[AtomWrite01_11], (instregex "BT(C|R|S)(16|32|64)mr")>; + +def AtomWrite01_13 : SchedWriteRes<[AtomPort01]> { + let Latency = 13; + let ResourceCycles = [13]; +} +def : InstRW<[AtomWrite01_13], (instrs AAA, AAS)>; + +def AtomWrite01_14 : SchedWriteRes<[AtomPort01]> { + let Latency = 14; + let ResourceCycles = [14]; +} +def : InstRW<[AtomWrite01_14], (instrs CMPXCHG16rm, CMPXCHG32rm, CMPXCHG64rm)>; + +def AtomWrite01_15 : SchedWriteRes<[AtomPort01]> { + let Latency = 15; + let ResourceCycles = [15]; +} +def : InstRW<[AtomWrite01_15], (instrs CMPXCHG16rr, CMPXCHG32rr, CMPXCHG64rr)>; + +def AtomWrite01_17 : SchedWriteRes<[AtomPort01]> { + let Latency = 17; + let ResourceCycles = [17]; +} +def : InstRW<[AtomWrite01_17], (instrs LOOPNE, PAUSE)>; + +def AtomWrite01_18 : SchedWriteRes<[AtomPort01]> { + let Latency = 18; + let ResourceCycles = [18]; +} +def : InstRW<[AtomWrite01_18], (instrs CMPXCHG8B, DAA, LOOP)>; + +def AtomWrite01_20 : SchedWriteRes<[AtomPort01]> { + let Latency = 20; + let ResourceCycles = [20]; +} +def : InstRW<[AtomWrite01_20], (instrs DAS)>; + +def AtomWrite01_21 : SchedWriteRes<[AtomPort01]> { + let Latency = 21; + let ResourceCycles = [21]; +} +def : InstRW<[AtomWrite01_21], (instrs AAM8i8, STD)>; + +def AtomWrite01_22 : SchedWriteRes<[AtomPort01]> { + let Latency = 22; + let ResourceCycles = [22]; +} +def : InstRW<[AtomWrite01_22], (instrs 
CMPXCHG16B)>; + +def AtomWrite01_23 : SchedWriteRes<[AtomPort01]> { + let Latency = 23; + let ResourceCycles = [23]; +} +def : InstRW<[AtomWrite01_23], (instrs ARPL16mr, ARPL16rr)>; + +def AtomWrite01_25 : SchedWriteRes<[AtomPort01]> { + let Latency = 25; + let ResourceCycles = [25]; +} +def : InstRW<[AtomWrite01_25], (instrs FNCLEX, FXTRACT)>; + +def AtomWrite01_26 : SchedWriteRes<[AtomPort01]> { + let Latency = 26; + let ResourceCycles = [26]; +} +def : InstRW<[AtomWrite01_26], (instrs POPF32, POPF64)>; + +def AtomWrite01_29 : SchedWriteRes<[AtomPort01]> { + let Latency = 29; + let ResourceCycles = [29]; +} +def : InstRW<[AtomWrite01_29], (instregex "POP(DS|ES|FS|GS)(16|32|64)")>; + +def AtomWrite01_30 : SchedWriteRes<[AtomPort01]> { + let Latency = 30; + let ResourceCycles = [30]; +} +def : InstRW<[AtomWrite01_30], (instrs RDTSC, RDTSCP)>; + +def AtomWrite01_32 : SchedWriteRes<[AtomPort01]> { + let Latency = 32; + let ResourceCycles = [32]; +} +def : InstRW<[AtomWrite01_32], (instrs ENTER, POPF16)>; + +def AtomWrite01_45 : SchedWriteRes<[AtomPort01]> { + let Latency = 45; + let ResourceCycles = [45]; +} +def : InstRW<[AtomWrite01_45], (instrs MONITORrrr)>; + +def AtomWrite01_46 : SchedWriteRes<[AtomPort01]> { + let Latency = 46; + let ResourceCycles = [46]; +} +def : InstRW<[AtomWrite01_46], (instrs FRNDINT, MWAITrr, RDPMC)>; + +def AtomWrite01_48 : SchedWriteRes<[AtomPort01]> { + let Latency = 48; + let ResourceCycles = [48]; +} +def : InstRW<[AtomWrite01_48], (instrs POPSS16, POPSS32)>; + +def AtomWrite01_55 : SchedWriteRes<[AtomPort01]> { + let Latency = 55; + let ResourceCycles = [55]; +} +def : InstRW<[AtomWrite01_55], (instrs FPREM)>; + +def AtomWrite01_59 : SchedWriteRes<[AtomPort01]> { + let Latency = 59; + let ResourceCycles = [59]; +} +def : InstRW<[AtomWrite01_59], (instrs INSB, INSL, INSW)>; + +def AtomWrite01_63 : SchedWriteRes<[AtomPort01]> { + let Latency = 63; + let ResourceCycles = [63]; +} +def : InstRW<[AtomWrite01_63], (instrs FNINIT)>; + +def AtomWrite01_68 : SchedWriteRes<[AtomPort01]> { + let Latency = 68; + let ResourceCycles = [68]; +} +def : InstRW<[AtomWrite01_68], (instrs OUT8rr, OUT16rr, OUT32rr)>; + +def AtomWrite01_71 : SchedWriteRes<[AtomPort01]> { + let Latency = 71; + let ResourceCycles = [71]; +} +def : InstRW<[AtomWrite01_71], (instrs FPREM1, + INVLPG, INVLPGA32, INVLPGA64)>; + +def AtomWrite01_72 : SchedWriteRes<[AtomPort01]> { + let Latency = 72; + let ResourceCycles = [72]; +} +def : InstRW<[AtomWrite01_72], (instrs OUT8ir, OUT16ir, OUT32ir)>; + +def AtomWrite01_74 : SchedWriteRes<[AtomPort01]> { + let Latency = 74; + let ResourceCycles = [74]; +} +def : InstRW<[AtomWrite01_74], (instrs OUTSB, OUTSL, OUTSW)>; + +def AtomWrite01_77 : SchedWriteRes<[AtomPort01]> { + let Latency = 77; + let ResourceCycles = [77]; +} +def : InstRW<[AtomWrite01_77], (instrs FSCALE)>; + +def AtomWrite01_78 : SchedWriteRes<[AtomPort01]> { + let Latency = 78; + let ResourceCycles = [78]; +} +def : InstRW<[AtomWrite01_78], (instrs RDMSR)>; + +def AtomWrite01_79 : SchedWriteRes<[AtomPort01]> { + let Latency = 79; + let ResourceCycles = [79]; +} +def : InstRW<[AtomWrite01_79], (instregex "RET(L|Q|W)?$", + "LRETI?(L|Q|W)")>; + +def AtomWrite01_92 : SchedWriteRes<[AtomPort01]> { + let Latency = 92; + let ResourceCycles = [92]; +} +def : InstRW<[AtomWrite01_92], (instrs IN8ri, IN16ri, IN32ri)>; + +def AtomWrite01_94 : SchedWriteRes<[AtomPort01]> { + let Latency = 94; + let ResourceCycles = [94]; +} +def : InstRW<[AtomWrite01_94], (instrs IN8rr, IN16rr, IN32rr)>; + +def 
AtomWrite01_99 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 99;
+  let ResourceCycles = [99];
+}
+def : InstRW<[AtomWrite01_99], (instrs F2XM1)>;
+
+def AtomWrite01_121 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 121;
+  let ResourceCycles = [121];
+}
+def : InstRW<[AtomWrite01_121], (instrs CPUID)>;
+
+def AtomWrite01_127 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 127;
+  let ResourceCycles = [127];
+}
+def : InstRW<[AtomWrite01_127], (instrs INT)>;
+
+def AtomWrite01_130 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 130;
+  let ResourceCycles = [130];
+}
+def : InstRW<[AtomWrite01_130], (instrs INT3)>;
+
+def AtomWrite01_140 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 140;
+  let ResourceCycles = [140];
+}
+def : InstRW<[AtomWrite01_140], (instrs FXSAVE, FXSAVE64)>;
+
+def AtomWrite01_141 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 141;
+  let ResourceCycles = [141];
+}
+def : InstRW<[AtomWrite01_141], (instrs FXRSTOR, FXRSTOR64)>;
+
+def AtomWrite01_146 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 146;
+  let ResourceCycles = [146];
+}
+def : InstRW<[AtomWrite01_146], (instrs FYL2X)>;
+
+def AtomWrite01_147 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 147;
+  let ResourceCycles = [147];
+}
+def : InstRW<[AtomWrite01_147], (instrs FYL2XP1)>;
+
+def AtomWrite01_168 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 168;
+  let ResourceCycles = [168];
+}
+def : InstRW<[AtomWrite01_168], (instrs FPTAN)>;
+
+def AtomWrite01_174 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 174;
+  let ResourceCycles = [174];
+}
+def : InstRW<[AtomWrite01_174], (instrs FSINCOS)>;
+def : InstRW<[AtomWrite01_174], (instregex "(COS|SIN)_F")>;
+
+def AtomWrite01_183 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 183;
+  let ResourceCycles = [183];
+}
+def : InstRW<[AtomWrite01_183], (instrs FPATAN)>;
+
+def AtomWrite01_202 : SchedWriteRes<[AtomPort01]> {
+  let Latency = 202;
+  let ResourceCycles = [202];
+}
+def : InstRW<[AtomWrite01_202], (instrs WRMSR)>;
+
+} // SchedModel
diff --git a/capstone/suite/synctools/tablegen/X86/back/X86ScheduleBtVer2.td b/capstone/suite/synctools/tablegen/X86/back/X86ScheduleBtVer2.td
new file mode 100644
index 000000000..719e71cd2
--- /dev/null
+++ b/capstone/suite/synctools/tablegen/X86/back/X86ScheduleBtVer2.td
@@ -0,0 +1,682 @@
+//=- X86ScheduleBtVer2.td - X86 BtVer2 (Jaguar) Scheduling ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for AMD btver2 (Jaguar) to support
+// instruction scheduling and other instruction cost heuristics. Based on the
+// AMD Software Optimization Guide for AMD Family 16h Processors and its
+// instruction latency appendix.
+//
+//===----------------------------------------------------------------------===//
+
+def BtVer2Model : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and btver2 can
+  // decode 2 instructions per cycle.
+  let IssueWidth = 2;
+  let MicroOpBufferSize = 64; // Retire Control Unit
+  let LoadLatency = 5; // FPU load latency (worst case; cf. the 3-cycle integer load latency)
+  let HighLatency = 25;
+  let MispredictPenalty = 14; // Minimum branch misprediction penalty
+  let PostRAScheduler = 1;
+
+  // FIXME: SSE4/AVX is unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+ let CompleteModel = 0; +} + +let SchedModel = BtVer2Model in { + +// Jaguar can issue up to 6 micro-ops in one cycle +def JALU0 : ProcResource<1>; // Integer Pipe0: integer ALU0 (also handle FP->INT jam) +def JALU1 : ProcResource<1>; // Integer Pipe1: integer ALU1/MUL/DIV +def JLAGU : ProcResource<1>; // Integer Pipe2: LAGU +def JSAGU : ProcResource<1>; // Integer Pipe3: SAGU (also handles 3-operand LEA) +def JFPU0 : ProcResource<1>; // Vector/FPU Pipe0: VALU0/VIMUL/FPA +def JFPU1 : ProcResource<1>; // Vector/FPU Pipe1: VALU1/STC/FPM + +// The Integer PRF for Jaguar is 64 entries, and it holds the architectural and +// speculative version of the 64-bit integer registers. +// Reference: www.realworldtech.com/jaguar/4/ +// +// The processor always keeps the different parts of an integer register +// together. An instruction that writes to a part of a register will therefore +// have a false dependence on any previous write to the same register or any +// part of it. +// Reference: Section 21.10 "AMD Bobcat and Jaguar pipeline: Partial register +// access" - Agner Fog's "microarchitecture.pdf". +def JIntegerPRF : RegisterFile<64, [GR64, CCR]>; + +// The Jaguar FP Retire Queue renames SIMD and FP uOps onto a pool of 72 SSE +// registers. Operations on 256-bit data types are cracked into two COPs. +// Reference: www.realworldtech.com/jaguar/4/ +def JFpuPRF: RegisterFile<72, [VR64, VR128, VR256], [1, 1, 2]>; + +// The retire control unit (RCU) can track up to 64 macro-ops in-flight. It can +// retire up to two macro-ops per cycle. +// Reference: "Software Optimization Guide for AMD Family 16h Processors" +def JRCU : RetireControlUnit<64, 2>; + +// Integer Pipe Scheduler +def JALU01 : ProcResGroup<[JALU0, JALU1]> { + let BufferSize=20; +} + +// AGU Pipe Scheduler +def JLSAGU : ProcResGroup<[JLAGU, JSAGU]> { + let BufferSize=12; +} + +// Fpu Pipe Scheduler +def JFPU01 : ProcResGroup<[JFPU0, JFPU1]> { + let BufferSize=18; +} + +// Functional units +def JDiv : ProcResource<1>; // integer division +def JMul : ProcResource<1>; // integer multiplication +def JVALU0 : ProcResource<1>; // vector integer +def JVALU1 : ProcResource<1>; // vector integer +def JVIMUL : ProcResource<1>; // vector integer multiplication +def JSTC : ProcResource<1>; // vector store/convert +def JFPM : ProcResource<1>; // FP multiplication +def JFPA : ProcResource<1>; // FP addition + +// Functional unit groups +def JFPX : ProcResGroup<[JFPA, JFPM]>; +def JVALU : ProcResGroup<[JVALU0, JVALU1]>; + +// Integer loads are 3 cycles, so ReadAfterLd registers needn't be available until 3 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 3>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when dispatched by the schedulers. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass JWriteResIntPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [], int UOps = 1> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses a cycle on JLAGU and adds 3 cycles to the + // latency. 
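+  //
+  // As a worked example: for the WriteALU pair defined further below,
+  //   defm : JWriteResIntPair<WriteALU, [JALU01], 1>;
+  // this folded-load record resolves to roughly
+  //   def : WriteRes<WriteALU.Folded, [JLAGU, JALU01]> { let Latency = 4; }
+  // i.e. the 3-cycle integer load plus the 1-cycle ALU operation.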
+ def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> { + let Latency = !add(Lat, 3); + let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res)); + let NumMicroOps = UOps; + } +} + +multiclass JWriteResFpuPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [], int UOps = 1> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses a cycle on JLAGU and adds 5 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> { + let Latency = !add(Lat, 5); + let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res)); + let NumMicroOps = UOps; + } +} + +multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [2], int UOps = 2> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses 2 cycles on JLAGU and adds 5 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, !listconcat([JLAGU], ExePorts)> { + let Latency = !add(Lat, 5); + let ResourceCycles = !listconcat([2], Res); + let NumMicroOps = UOps; + } +} + +// A folded store needs a cycle on the SAGU for the store data. +def : WriteRes<WriteRMW, [JSAGU]>; + +//////////////////////////////////////////////////////////////////////////////// +// Arithmetic. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteALU, [JALU01], 1>; +defm : JWriteResIntPair<WriteADC, [JALU01], 1, [2]>; +defm : JWriteResIntPair<WriteIMul, [JALU1, JMul], 3, [1, 1], 2>; // i8/i16/i32 multiplication +defm : JWriteResIntPair<WriteIMul64, [JALU1, JMul], 6, [1, 4], 2>; // i64 multiplication +defm : X86WriteRes<WriteIMulH, [JALU1], 6, [4], 1>; + +defm : X86WriteRes<WriteBSWAP32, [JALU01], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [JALU01], 1, [1], 1>; + +defm : JWriteResIntPair<WriteDiv8, [JALU1, JDiv], 12, [1, 12], 1>; +defm : JWriteResIntPair<WriteDiv16, [JALU1, JDiv], 17, [1, 17], 2>; +defm : JWriteResIntPair<WriteDiv32, [JALU1, JDiv], 25, [1, 25], 2>; +defm : JWriteResIntPair<WriteDiv64, [JALU1, JDiv], 41, [1, 41], 2>; +defm : JWriteResIntPair<WriteIDiv8, [JALU1, JDiv], 12, [1, 12], 1>; +defm : JWriteResIntPair<WriteIDiv16, [JALU1, JDiv], 17, [1, 17], 2>; +defm : JWriteResIntPair<WriteIDiv32, [JALU1, JDiv], 25, [1, 25], 2>; +defm : JWriteResIntPair<WriteIDiv64, [JALU1, JDiv], 41, [1, 41], 2>; + +defm : JWriteResIntPair<WriteCRC32, [JALU01], 3, [4], 3>; + +defm : JWriteResIntPair<WriteCMOV, [JALU01], 1>; // Conditional move. +defm : JWriteResIntPair<WriteCMOV2, [JALU01], 1>; // Conditional (CF + ZF flag) move. +defm : X86WriteRes<WriteFCMOV, [JFPU0, JFPA], 3, [1,1], 1>; // x87 conditional move. +def : WriteRes<WriteSETCC, [JALU01]>; // Setcc. +def : WriteRes<WriteSETCCStore, [JALU01,JSAGU]>; +def : WriteRes<WriteLAHFSAHF, [JALU01]>; +def : WriteRes<WriteBitTest,[JALU01]>; + +// This is for simple LEAs with one or two input operands. +def : WriteRes<WriteLEA, [JALU01]>; + +// Bit counts. 
+defm : JWriteResIntPair<WriteBSF, [JALU01], 5, [4], 8>; +defm : JWriteResIntPair<WriteBSR, [JALU01], 5, [4], 8>; +defm : JWriteResIntPair<WritePOPCNT, [JALU01], 1>; +defm : JWriteResIntPair<WriteLZCNT, [JALU01], 1>; +defm : JWriteResIntPair<WriteTZCNT, [JALU01], 2, [2]>; + +// BMI1 BEXTR, BMI2 BZHI +defm : JWriteResIntPair<WriteBEXTR, [JALU01], 1>; +defm : X86WriteResPairUnsupported<WriteBZHI>; + +//////////////////////////////////////////////////////////////////////////////// +// Integer shifts and rotates. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteShift, [JALU01], 1>; + +// SHLD/SHRD. +defm : X86WriteRes<WriteSHDrri, [JALU01], 3, [6], 6>; +defm : X86WriteRes<WriteSHDrrcl,[JALU01], 4, [8], 7>; +defm : X86WriteRes<WriteSHDmri, [JLAGU, JALU01], 9, [1, 22], 8>; +defm : X86WriteRes<WriteSHDmrcl,[JLAGU, JALU01], 9, [1, 22], 8>; + +//////////////////////////////////////////////////////////////////////////////// +// Loads, stores, and moves, not folded with other operations. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; } +def : WriteRes<WriteStore, [JSAGU]>; +def : WriteRes<WriteStoreNT, [JSAGU]>; +def : WriteRes<WriteMove, [JALU01]>; + +// Load/store MXCSR. +// FIXME: These are copy and pasted from WriteLoad/Store. +def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 5; } +def : WriteRes<WriteSTMXCSR, [JSAGU]>; + +// Treat misc copies as a move. +def : InstRW<[WriteMove], (instrs COPY)>; + +//////////////////////////////////////////////////////////////////////////////// +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteZero, []>; + +//////////////////////////////////////////////////////////////////////////////// +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResIntPair<WriteJump, [JALU01], 1>; + +//////////////////////////////////////////////////////////////////////////////// +// Special case scheduling classes. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteSystem, [JALU01]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [JALU01]> { let Latency = 100; } +def : WriteRes<WriteFence, [JSAGU]>; + +// Nops don't have dependencies, so there's no actual latency, but we set this +// to '1' to tell the scheduler that the nop uses an ALU slot for a cycle. +def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; } + +//////////////////////////////////////////////////////////////////////////////// +// Floating point. This covers both scalar and vector operations. 
+//////////////////////////////////////////////////////////////////////////////// + +defm : X86WriteRes<WriteFLD0, [JFPU1, JSTC], 3, [1,1], 1>; +defm : X86WriteRes<WriteFLD1, [JFPU1, JSTC], 3, [1,1], 1>; +defm : X86WriteRes<WriteFLDC, [JFPU1, JSTC], 3, [1,1], 1>; +defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFLoadX, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFLoadY, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 1, 2], 1>; +defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>; + +defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>; +defm : X86WriteRes<WriteFMaskedStore, [JSAGU, JFPU01, JFPX], 6, [1, 1, 4], 1>; +defm : X86WriteRes<WriteFMaskedStoreY, [JSAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>; + +defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>; +defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>; +defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>; + +defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>; + +defm : JWriteResFpuPair<WriteFAdd, [JFPU0, JFPA], 3>; +defm : JWriteResFpuPair<WriteFAddX, [JFPU0, JFPA], 3>; +defm : JWriteResYMMPair<WriteFAddY, [JFPU0, JFPA], 3, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteFAddZ>; +defm : JWriteResFpuPair<WriteFAdd64, [JFPU0, JFPA], 3>; +defm : JWriteResFpuPair<WriteFAdd64X, [JFPU0, JFPA], 3>; +defm : JWriteResYMMPair<WriteFAdd64Y, [JFPU0, JFPA], 3, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteFAdd64Z>; +defm : JWriteResFpuPair<WriteFCmp, [JFPU0, JFPA], 2>; +defm : JWriteResFpuPair<WriteFCmpX, [JFPU0, JFPA], 2>; +defm : JWriteResYMMPair<WriteFCmpY, [JFPU0, JFPA], 2, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteFCmpZ>; +defm : JWriteResFpuPair<WriteFCmp64, [JFPU0, JFPA], 2>; +defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>; +defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteFCmp64Z>; +defm : JWriteResFpuPair<WriteFCom, [JFPU0, JFPA, JALU0], 3>; +defm : JWriteResFpuPair<WriteFMul, [JFPU1, JFPM], 2>; +defm : JWriteResFpuPair<WriteFMulX, [JFPU1, JFPM], 2>; +defm : JWriteResYMMPair<WriteFMulY, [JFPU1, JFPM], 2, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteFMulZ>; +defm : JWriteResFpuPair<WriteFMul64, [JFPU1, JFPM], 4, [1,2]>; +defm : JWriteResFpuPair<WriteFMul64X, [JFPU1, JFPM], 4, [1,2]>; +defm : JWriteResYMMPair<WriteFMul64Y, [JFPU1, JFPM], 4, [2,4], 2>; +defm : X86WriteResPairUnsupported<WriteFMul64Z>; +defm : X86WriteResPairUnsupported<WriteFMA>; +defm : X86WriteResPairUnsupported<WriteFMAX>; +defm : X86WriteResPairUnsupported<WriteFMAY>; +defm : X86WriteResPairUnsupported<WriteFMAZ>; +defm : JWriteResFpuPair<WriteDPPD, [JFPU1, JFPM, JFPA], 9, [1, 3, 3], 3>; +defm : JWriteResFpuPair<WriteDPPS, [JFPU1, JFPM, JFPA], 11, [1, 3, 3], 5>; +defm : JWriteResYMMPair<WriteDPPSY, [JFPU1, JFPM, JFPA], 12, [2, 6, 6], 10>; +defm : X86WriteResPairUnsupported<WriteDPPSZ>; +defm : JWriteResFpuPair<WriteFRcp, [JFPU1, JFPM], 2>; +defm : JWriteResFpuPair<WriteFRcpX, [JFPU1, JFPM], 2>; +defm : 
JWriteResYMMPair<WriteFRcpY, [JFPU1, JFPM], 2, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteFRcpZ>; +defm : JWriteResFpuPair<WriteFRsqrt, [JFPU1, JFPM], 2>; +defm : JWriteResFpuPair<WriteFRsqrtX, [JFPU1, JFPM], 2>; +defm : JWriteResYMMPair<WriteFRsqrtY, [JFPU1, JFPM], 2, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; +defm : JWriteResFpuPair<WriteFDiv, [JFPU1, JFPM], 19, [1, 19]>; +defm : JWriteResFpuPair<WriteFDivX, [JFPU1, JFPM], 19, [1, 19]>; +defm : JWriteResYMMPair<WriteFDivY, [JFPU1, JFPM], 38, [2, 38], 2>; +defm : X86WriteResPairUnsupported<WriteFDivZ>; +defm : JWriteResFpuPair<WriteFDiv64, [JFPU1, JFPM], 19, [1, 19]>; +defm : JWriteResFpuPair<WriteFDiv64X, [JFPU1, JFPM], 19, [1, 19]>; +defm : JWriteResYMMPair<WriteFDiv64Y, [JFPU1, JFPM], 38, [2, 38], 2>; +defm : X86WriteResPairUnsupported<WriteFDiv64Z>; +defm : JWriteResFpuPair<WriteFSqrt, [JFPU1, JFPM], 21, [1, 21]>; +defm : JWriteResFpuPair<WriteFSqrtX, [JFPU1, JFPM], 21, [1, 21]>; +defm : JWriteResYMMPair<WriteFSqrtY, [JFPU1, JFPM], 42, [2, 42], 2>; +defm : X86WriteResPairUnsupported<WriteFSqrtZ>; +defm : JWriteResFpuPair<WriteFSqrt64, [JFPU1, JFPM], 27, [1, 27]>; +defm : JWriteResFpuPair<WriteFSqrt64X, [JFPU1, JFPM], 27, [1, 27]>; +defm : JWriteResYMMPair<WriteFSqrt64Y, [JFPU1, JFPM], 54, [2, 54], 2>; +defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; +defm : JWriteResFpuPair<WriteFSqrt80, [JFPU1, JFPM], 35, [1, 35]>; +defm : JWriteResFpuPair<WriteFSign, [JFPU1, JFPM], 2>; +defm : JWriteResFpuPair<WriteFRnd, [JFPU1, JSTC], 3>; +defm : JWriteResYMMPair<WriteFRndY, [JFPU1, JSTC], 3, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteFRndZ>; +defm : JWriteResFpuPair<WriteFLogic, [JFPU01, JFPX], 1>; +defm : JWriteResYMMPair<WriteFLogicY, [JFPU01, JFPX], 1, [2, 2], 2>; +defm : X86WriteResPairUnsupported<WriteFLogicZ>; +defm : JWriteResFpuPair<WriteFTest, [JFPU0, JFPA, JALU0], 3>; +defm : JWriteResYMMPair<WriteFTestY , [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>; +defm : X86WriteResPairUnsupported<WriteFTestZ>; +defm : JWriteResFpuPair<WriteFShuffle, [JFPU01, JFPX], 1>; +defm : JWriteResYMMPair<WriteFShuffleY, [JFPU01, JFPX], 1, [2, 2], 2>; +defm : X86WriteResPairUnsupported<WriteFShuffleZ>; +defm : JWriteResFpuPair<WriteFVarShuffle, [JFPU01, JFPX], 2, [1, 4], 3>; +defm : JWriteResYMMPair<WriteFVarShuffleY,[JFPU01, JFPX], 3, [2, 6], 6>; +defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; +defm : JWriteResFpuPair<WriteFBlend, [JFPU01, JFPX], 1>; +defm : JWriteResYMMPair<WriteFBlendY, [JFPU01, JFPX], 1, [2, 2], 2>; +defm : X86WriteResPairUnsupported<WriteFBlendZ>; +defm : JWriteResFpuPair<WriteFVarBlend, [JFPU01, JFPX], 2, [1, 4], 3>; +defm : JWriteResYMMPair<WriteFVarBlendY, [JFPU01, JFPX], 3, [2, 6], 6>; +defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; +defm : JWriteResFpuPair<WriteFShuffle256, [JFPU01, JFPX], 1>; +defm : X86WriteResPairUnsupported<WriteFVarShuffle256>; + +//////////////////////////////////////////////////////////////////////////////// +// Conversions. 
+//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResFpuPair<WriteCvtSS2I, [JFPU1, JSTC, JFPA, JALU0], 7, [1,1,1,1], 2>; +defm : JWriteResFpuPair<WriteCvtPS2I, [JFPU1, JSTC], 3, [1,1], 1>; +defm : JWriteResYMMPair<WriteCvtPS2IY, [JFPU1, JSTC], 3, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; +defm : JWriteResFpuPair<WriteCvtSD2I, [JFPU1, JSTC, JFPA, JALU0], 7, [1,1,1,1], 2>; +defm : JWriteResFpuPair<WriteCvtPD2I, [JFPU1, JSTC], 3, [1,1], 1>; +defm : JWriteResYMMPair<WriteCvtPD2IY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>; +defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; + +// FIXME: f+3 ST, LD+STC latency +defm : JWriteResFpuPair<WriteCvtI2SS, [JFPU1, JSTC], 9, [1,1], 2>; +defm : JWriteResFpuPair<WriteCvtI2PS, [JFPU1, JSTC], 3, [1,1], 1>; +defm : JWriteResYMMPair<WriteCvtI2PSY, [JFPU1, JSTC], 3, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; +defm : JWriteResFpuPair<WriteCvtI2SD, [JFPU1, JSTC], 9, [1,1], 2>; +defm : JWriteResFpuPair<WriteCvtI2PD, [JFPU1, JSTC], 3, [1,1], 1>; +defm : JWriteResYMMPair<WriteCvtI2PDY, [JFPU1, JSTC], 3, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; + +defm : JWriteResFpuPair<WriteCvtSS2SD, [JFPU1, JSTC], 7, [1,2], 2>; +defm : JWriteResFpuPair<WriteCvtPS2PD, [JFPU1, JSTC], 2, [1,1], 1>; +defm : JWriteResYMMPair<WriteCvtPS2PDY, [JFPU1, JSTC], 2, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; + +defm : JWriteResFpuPair<WriteCvtSD2SS, [JFPU1, JSTC], 7, [1,2], 2>; +defm : JWriteResFpuPair<WriteCvtPD2PS, [JFPU1, JSTC], 3, [1,1], 1>; +defm : JWriteResYMMPair<WriteCvtPD2PSY, [JFPU1, JSTC, JFPX], 6, [2,2,4], 3>; +defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; + +defm : JWriteResFpuPair<WriteCvtPH2PS, [JFPU1, JSTC], 3, [1,1], 1>; +defm : JWriteResYMMPair<WriteCvtPH2PSY, [JFPU1, JSTC], 3, [2,2], 2>; +defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; + +defm : X86WriteRes<WriteCvtPS2PH, [JFPU1, JSTC], 3, [1,1], 1>; +defm : X86WriteRes<WriteCvtPS2PHY, [JFPU1, JSTC, JFPX], 6, [2,2,2], 3>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; +defm : X86WriteRes<WriteCvtPS2PHSt, [JFPU1, JSTC, JSAGU], 4, [1,1,1], 1>; +defm : X86WriteRes<WriteCvtPS2PHYSt, [JFPU1, JSTC, JFPX, JSAGU], 7, [2,2,2,1], 3>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; + +//////////////////////////////////////////////////////////////////////////////// +// Vector integer operations. 
+//////////////////////////////////////////////////////////////////////////////// + +defm : X86WriteRes<WriteVecLoad, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecLoadX, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecLoadY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecLoadNT, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecLoadNTY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecMaskedLoad, [JLAGU, JFPU01, JVALU], 6, [1, 1, 2], 1>; +defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>; + +defm : X86WriteRes<WriteVecStore, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>; +defm : X86WriteRes<WriteVecMaskedStore, [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>; +defm : X86WriteRes<WriteVecMaskedStoreY, [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>; + +defm : X86WriteRes<WriteVecMove, [JFPU01, JVALU], 1, [1, 1], 1>; +defm : X86WriteRes<WriteVecMoveX, [JFPU01, JVALU], 1, [1, 1], 1>; +defm : X86WriteRes<WriteVecMoveY, [JFPU01, JVALU], 1, [2, 2], 2>; +defm : X86WriteRes<WriteVecMoveToGpr, [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecMoveFromGpr, [JFPU01, JFPX], 8, [1, 1], 2>; + +defm : JWriteResFpuPair<WriteVecALU, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WriteVecALUX, [JFPU01, JVALU], 1>; +defm : X86WriteResPairUnsupported<WriteVecALUY>; +defm : X86WriteResPairUnsupported<WriteVecALUZ>; +defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 1>; +defm : X86WriteResPairUnsupported<WriteVecShiftY>; +defm : X86WriteResPairUnsupported<WriteVecShiftZ>; +defm : JWriteResFpuPair<WriteVecShiftImm, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WriteVecShiftImmX,[JFPU01, JVALU], 1>; +defm : X86WriteResPairUnsupported<WriteVecShiftImmY>; +defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; +defm : X86WriteResPairUnsupported<WriteVarVecShift>; +defm : X86WriteResPairUnsupported<WriteVarVecShiftY>; +defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; +defm : JWriteResFpuPair<WriteVecIMul, [JFPU0, JVIMUL], 2>; +defm : JWriteResFpuPair<WriteVecIMulX, [JFPU0, JVIMUL], 2>; +defm : X86WriteResPairUnsupported<WriteVecIMulY>; +defm : X86WriteResPairUnsupported<WriteVecIMulZ>; +defm : JWriteResFpuPair<WritePMULLD, [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>; +defm : X86WriteResPairUnsupported<WritePMULLDY>; +defm : X86WriteResPairUnsupported<WritePMULLDZ>; +defm : JWriteResFpuPair<WriteMPSAD, [JFPU0, JVIMUL], 3, [1, 2]>; +defm : X86WriteResPairUnsupported<WriteMPSADY>; +defm : X86WriteResPairUnsupported<WriteMPSADZ>; +defm : JWriteResFpuPair<WritePSADBW, [JFPU01, JVALU], 2>; +defm : JWriteResFpuPair<WritePSADBWX, [JFPU01, JVALU], 2>; +defm : X86WriteResPairUnsupported<WritePSADBWY>; +defm : X86WriteResPairUnsupported<WritePSADBWZ>; +defm : JWriteResFpuPair<WritePHMINPOS, [JFPU0, JVALU], 2>; +defm : JWriteResFpuPair<WriteShuffle, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WriteShuffleX, [JFPU01, JVALU], 1>; +defm : X86WriteResPairUnsupported<WriteShuffleY>; +defm : X86WriteResPairUnsupported<WriteShuffleZ>; +defm : JWriteResFpuPair<WriteVarShuffle, [JFPU01, JVALU], 2, [1, 4], 3>; +defm : 
JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>; +defm : X86WriteResPairUnsupported<WriteVarShuffleY>; +defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; +defm : JWriteResFpuPair<WriteBlend, [JFPU01, JVALU], 1>; +defm : X86WriteResPairUnsupported<WriteBlendY>; +defm : X86WriteResPairUnsupported<WriteBlendZ>; +defm : JWriteResFpuPair<WriteVarBlend, [JFPU01, JVALU], 2, [1, 4], 3>; +defm : X86WriteResPairUnsupported<WriteVarBlendY>; +defm : X86WriteResPairUnsupported<WriteVarBlendZ>; +defm : JWriteResFpuPair<WriteVecLogic, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WriteVecLogicX, [JFPU01, JVALU], 1>; +defm : X86WriteResPairUnsupported<WriteVecLogicY>; +defm : X86WriteResPairUnsupported<WriteVecLogicZ>; +defm : JWriteResFpuPair<WriteVecTest, [JFPU0, JFPA, JALU0], 3>; +defm : JWriteResYMMPair<WriteVecTestY, [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>; +defm : X86WriteResPairUnsupported<WriteVecTestZ>; +defm : X86WriteResPairUnsupported<WriteShuffle256>; +defm : X86WriteResPairUnsupported<WriteVarShuffle256>; + +//////////////////////////////////////////////////////////////////////////////// +// Vector insert/extract operations. +//////////////////////////////////////////////////////////////////////////////// + +defm : X86WriteRes<WriteVecInsert, [JFPU01, JVALU], 7, [1,1], 2>; +defm : X86WriteRes<WriteVecInsertLd, [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>; +defm : X86WriteRes<WriteVecExtract, [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>; +defm : X86WriteRes<WriteVecExtractSt, [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>; + +//////////////////////////////////////////////////////////////////////////////// +// SSE42 String instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResFpuPair<WritePCmpIStrI, [JFPU1, JVALU1, JFPA, JALU0], 7, [1, 2, 1, 1], 3>; +defm : JWriteResFpuPair<WritePCmpIStrM, [JFPU1, JVALU1, JFPA, JALU0], 8, [1, 2, 1, 1], 3>; +defm : JWriteResFpuPair<WritePCmpEStrI, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>; +defm : JWriteResFpuPair<WritePCmpEStrM, [JFPU1, JSAGU, JLAGU, JVALU, JVALU1, JFPA, JALU0], 14, [1, 2, 2, 6, 4, 1, 1], 9>; + +//////////////////////////////////////////////////////////////////////////////// +// MOVMSK Instructions. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes<WriteFMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; } +def : WriteRes<WriteVecMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; } +defm : X86WriteResUnsupported<WriteVecMOVMSKY>; +def : WriteRes<WriteMMXMOVMSK, [JFPU0, JFPA, JALU0]> { let Latency = 3; } + +//////////////////////////////////////////////////////////////////////////////// +// AES Instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResFpuPair<WriteAESIMC, [JFPU0, JVIMUL], 2>; +defm : JWriteResFpuPair<WriteAESKeyGen, [JFPU0, JVIMUL], 2>; +defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU0, JVIMUL], 3, [1, 1], 2>; + +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. 
+//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 3>; +defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 3, [2,2], 2>; +defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 1>; +defm : X86WriteResPairUnsupported<WritePHAddY>; + +//////////////////////////////////////////////////////////////////////////////// +// Carry-less multiplication instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : JWriteResFpuPair<WriteCLMul, [JFPU0, JVIMUL], 2>; + +//////////////////////////////////////////////////////////////////////////////// +// SSE4A instructions. +//////////////////////////////////////////////////////////////////////////////// + +def JWriteINSERTQ: SchedWriteRes<[JFPU01, JVALU]> { + let Latency = 2; + let ResourceCycles = [1, 4]; +} +def : InstRW<[JWriteINSERTQ], (instrs INSERTQ, INSERTQI)>; + +//////////////////////////////////////////////////////////////////////////////// +// AVX instructions. +//////////////////////////////////////////////////////////////////////////////// + +def JWriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPX]> { + let Latency = 6; + let ResourceCycles = [1, 2, 4]; + let NumMicroOps = 2; +} +def : InstRW<[JWriteVBROADCASTYLd, ReadAfterLd], (instrs VBROADCASTSDYrm, + VBROADCASTSSYrm)>; + +def JWriteJVZEROALL: SchedWriteRes<[]> { + let Latency = 90; + let NumMicroOps = 73; +} +def : InstRW<[JWriteJVZEROALL], (instrs VZEROALL)>; + +def JWriteJVZEROUPPER: SchedWriteRes<[]> { + let Latency = 46; + let NumMicroOps = 37; +} +def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>; + +/////////////////////////////////////////////////////////////////////////////// +// SchedWriteVariant definitions. +/////////////////////////////////////////////////////////////////////////////// + +def JWriteZeroLatency : SchedWriteRes<[]> { + let Latency = 0; +} + +// Certain instructions that use the same register for both source +// operands do not have a real dependency on the previous contents of the +// register, and thus, do not have to wait before completing. They can be +// optimized out at register renaming stage. +// Reference: Section 10.8 of the "Software Optimization Guide for AMD Family +// 15h Processors". +// Reference: Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", +// Section 21.8 [Dependency-breaking instructions]. 
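+//
+// For illustration: an instruction such as "xorl %eax, %eax" (XOR32rr with
+// matching source operands) always produces zero regardless of the previous
+// value of EAX, so the variants below resolve it to JWriteZeroLatency when
+// ZeroIdiomPredicate matches, and fall back to the regular write class
+// (e.g. WriteALU) otherwise.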
+ +def JWriteZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, + SchedVar<MCSchedPredicate<TruePred>, [WriteALU]> +]>; +def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr, + XOR32rr, XOR64rr)>; + +def JWriteFZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, + SchedVar<MCSchedPredicate<TruePred>, [WriteFLogic]> +]>; +def : InstRW<[JWriteFZeroIdiom], (instrs XORPSrr, VXORPSrr, XORPDrr, VXORPDrr, + ANDNPSrr, VANDNPSrr, + ANDNPDrr, VANDNPDrr)>; + +def JWriteVZeroIdiomLogic : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, + SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogic]> +]>; +def : InstRW<[JWriteVZeroIdiomLogic], (instrs MMX_PXORirr, MMX_PANDNirr)>; + +def JWriteVZeroIdiomLogicX : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, + SchedVar<MCSchedPredicate<TruePred>, [WriteVecLogicX]> +]>; +def : InstRW<[JWriteVZeroIdiomLogicX], (instrs PXORrr, VPXORrr, + PANDNrr, VPANDNrr)>; + +def JWriteVZeroIdiomALU : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, + SchedVar<MCSchedPredicate<TruePred>, [WriteVecALU]> +]>; +def : InstRW<[JWriteVZeroIdiomALU], (instrs MMX_PSUBBirr, MMX_PSUBDirr, + MMX_PSUBQirr, MMX_PSUBWirr, + MMX_PCMPGTBirr, MMX_PCMPGTDirr, + MMX_PCMPGTWirr)>; + +def JWriteVZeroIdiomALUX : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, + SchedVar<MCSchedPredicate<TruePred>, [WriteVecALUX]> +]>; +def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr, + PSUBDrr, VPSUBDrr, + PSUBQrr, VPSUBQrr, + PSUBWrr, VPSUBWrr, + PCMPGTBrr, VPCMPGTBrr, + PCMPGTDrr, VPCMPGTDrr, + PCMPGTQrr, VPCMPGTQrr, + PCMPGTWrr, VPCMPGTWrr)>; + +// This write is used for slow LEA instructions. +def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> { + let Latency = 2; +} + +// On Jaguar, a slow LEA is either a 3Ops LEA (base, index, offset), or an LEA +// with a `Scale` value different than 1. +def JSlowLEAPredicate : MCSchedPredicate< + CheckAny<[ + // A 3-operand LEA (base, index, offset). + IsThreeOperandsLEAFn, + // An LEA with a "Scale" different than 1. + CheckAll<[ + CheckIsImmOperand<2>, + CheckNot<CheckImmOperand<2, 1>> + ]> + ]> +>; + +def JWriteLEA : SchedWriteVariant<[ + SchedVar<JSlowLEAPredicate, [JWrite3OpsLEA]>, + SchedVar<MCSchedPredicate<TruePred>, [WriteLEA]> +]>; + +def : InstRW<[JWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>; + +def JSlowLEA16r : SchedWriteRes<[JALU01]> { + let Latency = 3; + let ResourceCycles = [4]; +} + +def : InstRW<[JSlowLEA16r], (instrs LEA16r)>; + +} // SchedModel diff --git a/capstone/suite/synctools/tablegen/X86/back/X86ScheduleSLM.td b/capstone/suite/synctools/tablegen/X86/back/X86ScheduleSLM.td new file mode 100644 index 000000000..b1e843013 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86ScheduleSLM.td @@ -0,0 +1,486 @@ +//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Intel Silvermont to support +// instruction scheduling and other instruction cost heuristics. 
+// +//===----------------------------------------------------------------------===// + +def SLMModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and SLM can decode 2 + // instructions per cycle. + let IssueWidth = 2; + let MicroOpBufferSize = 32; // Based on the reorder buffer. + let LoadLatency = 3; + let MispredictPenalty = 10; + let PostRAScheduler = 1; + + // For small loops, expand by a small factor to hide the backedge cost. + let LoopMicroOpBufferSize = 10; + + // FIXME: SSE4 is unimplemented. This flag is set to allow + // the scheduler to assign a default model to unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = SLMModel in { + +// Silvermont has 5 reservation stations for micro-ops +def SLM_IEC_RSV0 : ProcResource<1>; +def SLM_IEC_RSV1 : ProcResource<1>; +def SLM_FPC_RSV0 : ProcResource<1> { let BufferSize = 1; } +def SLM_FPC_RSV1 : ProcResource<1> { let BufferSize = 1; } +def SLM_MEC_RSV : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def SLM_IEC_RSV01 : ProcResGroup<[SLM_IEC_RSV0, SLM_IEC_RSV1]>; +def SLM_FPC_RSV01 : ProcResGroup<[SLM_FPC_RSV0, SLM_FPC_RSV1]>; + +def SLMDivider : ProcResource<1>; +def SLMFPMultiplier : ProcResource<1>; +def SLMFPDivider : ProcResource<1>; + +// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 3>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SLMWriteResPair<X86FoldableSchedWrite SchedRW, + list<ProcResourceKind> ExePorts, + int Lat, list<int> Res = [1], int UOps = 1, + int LoadLat = 3> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, ExePorts> { + let Latency = Lat; + let ResourceCycles = Res; + let NumMicroOps = UOps; + } + + // Memory variant also uses a cycle on MEC_RSV and adds LoadLat cycles to + // the latency (default = 3). + def : WriteRes<SchedRW.Folded, !listconcat([SLM_MEC_RSV], ExePorts)> { + let Latency = !add(Lat, LoadLat); + let ResourceCycles = !listconcat([1], Res); + let NumMicroOps = UOps; + } +} + +// A folded store needs a cycle on MEC_RSV for the store data, but it does not +// need an extra port cycle to recompute the address. +def : WriteRes<WriteRMW, [SLM_MEC_RSV]>; + +def : WriteRes<WriteStore, [SLM_IEC_RSV01, SLM_MEC_RSV]>; +def : WriteRes<WriteStoreNT, [SLM_IEC_RSV01, SLM_MEC_RSV]>; +def : WriteRes<WriteLoad, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteMove, [SLM_IEC_RSV01]>; +def : WriteRes<WriteZero, []>; + +// Load/store MXCSR. +// FIXME: These are probably wrong. They are copy pasted from WriteStore/Load. +def : WriteRes<WriteSTMXCSR, [SLM_IEC_RSV01, SLM_MEC_RSV]>; +def : WriteRes<WriteLDMXCSR, [SLM_MEC_RSV]> { let Latency = 3; } + +// Treat misc copies as a move. 
+def : InstRW<[WriteMove], (instrs COPY)>; + +defm : SLMWriteResPair<WriteALU, [SLM_IEC_RSV01], 1>; +defm : SLMWriteResPair<WriteADC, [SLM_IEC_RSV01], 1>; +defm : SLMWriteResPair<WriteIMul, [SLM_IEC_RSV1], 3>; +defm : SLMWriteResPair<WriteIMul64, [SLM_IEC_RSV1], 3>; + +defm : X86WriteRes<WriteBSWAP32, [SLM_IEC_RSV01], 1, [1], 1>; +defm : X86WriteRes<WriteBSWAP64, [SLM_IEC_RSV01], 1, [1], 1>; + +defm : SLMWriteResPair<WriteShift, [SLM_IEC_RSV0], 1>; + +defm : X86WriteRes<WriteSHDrri, [SLM_IEC_RSV0], 1, [1], 1>; +defm : X86WriteRes<WriteSHDrrcl,[SLM_IEC_RSV0], 1, [1], 1>; +defm : X86WriteRes<WriteSHDmri, [SLM_MEC_RSV, SLM_IEC_RSV0], 4, [2, 1], 2>; +defm : X86WriteRes<WriteSHDmrcl,[SLM_MEC_RSV, SLM_IEC_RSV0], 4, [2, 1], 2>; + +defm : SLMWriteResPair<WriteJump, [SLM_IEC_RSV1], 1>; +defm : SLMWriteResPair<WriteCRC32, [SLM_IEC_RSV1], 3>; + +defm : SLMWriteResPair<WriteCMOV, [SLM_IEC_RSV01], 2, [2]>; +defm : SLMWriteResPair<WriteCMOV2, [SLM_IEC_RSV01], 2, [2]>; +defm : X86WriteRes<WriteFCMOV, [SLM_FPC_RSV1], 3, [1], 1>; // x87 conditional move. +def : WriteRes<WriteSETCC, [SLM_IEC_RSV01]>; +def : WriteRes<WriteSETCCStore, [SLM_IEC_RSV01, SLM_MEC_RSV]> { + // FIXME Latency and NumMicrOps? + let ResourceCycles = [2,1]; +} +def : WriteRes<WriteLAHFSAHF, [SLM_IEC_RSV01]>; +def : WriteRes<WriteBitTest,[SLM_IEC_RSV01]>; + +// This is for simple LEAs with one or two input operands. +// The complex ones can only execute on port 1, and they require two cycles on +// the port to read all inputs. We don't model that. +def : WriteRes<WriteLEA, [SLM_IEC_RSV1]>; + +// Bit counts. +defm : SLMWriteResPair<WriteBSF, [SLM_IEC_RSV01], 10, [20], 10>; +defm : SLMWriteResPair<WriteBSR, [SLM_IEC_RSV01], 10, [20], 10>; +defm : SLMWriteResPair<WriteLZCNT, [SLM_IEC_RSV0], 3>; +defm : SLMWriteResPair<WriteTZCNT, [SLM_IEC_RSV0], 3>; +defm : SLMWriteResPair<WritePOPCNT, [SLM_IEC_RSV0], 3>; + +// BMI1 BEXTR, BMI2 BZHI +defm : X86WriteResPairUnsupported<WriteBEXTR>; +defm : X86WriteResPairUnsupported<WriteBZHI>; + +defm : SLMWriteResPair<WriteDiv8, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>; +defm : SLMWriteResPair<WriteDiv16, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>; +defm : SLMWriteResPair<WriteDiv32, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>; +defm : SLMWriteResPair<WriteDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>; +defm : SLMWriteResPair<WriteIDiv8, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>; +defm : SLMWriteResPair<WriteIDiv16, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>; +defm : SLMWriteResPair<WriteIDiv32, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>; +defm : SLMWriteResPair<WriteIDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, 4>; + +// Scalar and vector floating point. 
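+// For illustration, a pair such as
+//   defm : SLMWriteResPair<WriteFAdd, [SLM_FPC_RSV1], 3>;
+// below expands, per the SLMWriteResPair multiclass above, to roughly:
+//   def : WriteRes<WriteFAdd,   [SLM_FPC_RSV1]>              { let Latency = 3; }
+//   def : WriteRes<WriteFAddLd, [SLM_MEC_RSV, SLM_FPC_RSV1]> { let Latency = 6; }
+// i.e. the folded-load form adds a cycle on SLM_MEC_RSV and the default
+// 3-cycle load latency.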
+defm : X86WriteRes<WriteFLD0, [SLM_FPC_RSV01], 1, [1], 1>; +defm : X86WriteRes<WriteFLD1, [SLM_FPC_RSV01], 1, [1], 1>; +defm : X86WriteRes<WriteFLDC, [SLM_FPC_RSV01], 1, [2], 2>; +def : WriteRes<WriteFLoad, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteFLoadX, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteFLoadY, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteFMaskedLoad, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteFMaskedLoadY, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteFStore, [SLM_MEC_RSV]>; +def : WriteRes<WriteFStoreX, [SLM_MEC_RSV]>; +def : WriteRes<WriteFStoreY, [SLM_MEC_RSV]>; +def : WriteRes<WriteFStoreNT, [SLM_MEC_RSV]>; +def : WriteRes<WriteFStoreNTX, [SLM_MEC_RSV]>; +def : WriteRes<WriteFStoreNTY, [SLM_MEC_RSV]>; +def : WriteRes<WriteFMaskedStore, [SLM_MEC_RSV]>; +def : WriteRes<WriteFMaskedStoreY, [SLM_MEC_RSV]>; +def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>; +def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>; +def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>; +defm : X86WriteRes<WriteEMMS, [SLM_FPC_RSV01], 10, [10], 9>; + +defm : SLMWriteResPair<WriteFAdd, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteFAddX, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteFAddY, [SLM_FPC_RSV1], 3>; +defm : X86WriteResPairUnsupported<WriteFAddZ>; +defm : SLMWriteResPair<WriteFAdd64, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteFAdd64X, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteFAdd64Y, [SLM_FPC_RSV1], 3>; +defm : X86WriteResPairUnsupported<WriteFAdd64Z>; +defm : SLMWriteResPair<WriteFCmp, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteFCmpX, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteFCmpY, [SLM_FPC_RSV1], 3>; +defm : X86WriteResPairUnsupported<WriteFCmpZ>; +defm : SLMWriteResPair<WriteFCmp64, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteFCmp64X, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteFCmp64Y, [SLM_FPC_RSV1], 3>; +defm : X86WriteResPairUnsupported<WriteFCmp64Z>; +defm : SLMWriteResPair<WriteFCom, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteFMul, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>; +defm : SLMWriteResPair<WriteFMulX, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>; +defm : SLMWriteResPair<WriteFMulY, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>; +defm : X86WriteResPairUnsupported<WriteFMulZ>; +defm : SLMWriteResPair<WriteFMul64, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>; +defm : SLMWriteResPair<WriteFMul64X, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>; +defm : SLMWriteResPair<WriteFMul64Y, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>; +defm : X86WriteResPairUnsupported<WriteFMul64Z>; +defm : SLMWriteResPair<WriteFDiv, [SLM_FPC_RSV0, SLMFPDivider], 19, [1,17]>; +defm : SLMWriteResPair<WriteFDivX, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>; +defm : SLMWriteResPair<WriteFDivY, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>; +defm : X86WriteResPairUnsupported<WriteFDivZ>; +defm : SLMWriteResPair<WriteFDiv64, [SLM_FPC_RSV0, SLMFPDivider], 34, [1,32]>; +defm : SLMWriteResPair<WriteFDiv64X, [SLM_FPC_RSV0, SLMFPDivider], 69, [1,69]>; +defm : SLMWriteResPair<WriteFDiv64Y, [SLM_FPC_RSV0, SLMFPDivider], 69, [1,69]>; +defm : X86WriteResPairUnsupported<WriteFDiv64Z>; +defm : SLMWriteResPair<WriteFRcp, [SLM_FPC_RSV0], 5>; +defm : SLMWriteResPair<WriteFRcpX, [SLM_FPC_RSV0], 5>; +defm : SLMWriteResPair<WriteFRcpY, [SLM_FPC_RSV0], 5>; +defm : X86WriteResPairUnsupported<WriteFRcpZ>; +defm : SLMWriteResPair<WriteFRsqrt, [SLM_FPC_RSV0], 5>; +defm : SLMWriteResPair<WriteFRsqrtX, [SLM_FPC_RSV0], 5>; +defm : 
SLMWriteResPair<WriteFRsqrtY, [SLM_FPC_RSV0], 5>; +defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; +defm : SLMWriteResPair<WriteFSqrt, [SLM_FPC_RSV0,SLMFPDivider], 20, [1,20], 1, 3>; +defm : SLMWriteResPair<WriteFSqrtX, [SLM_FPC_RSV0,SLMFPDivider], 41, [1,40], 1, 3>; +defm : SLMWriteResPair<WriteFSqrtY, [SLM_FPC_RSV0,SLMFPDivider], 41, [1,40], 1, 3>; +defm : X86WriteResPairUnsupported<WriteFSqrtZ>; +defm : SLMWriteResPair<WriteFSqrt64, [SLM_FPC_RSV0,SLMFPDivider], 35, [1,35], 1, 3>; +defm : SLMWriteResPair<WriteFSqrt64X, [SLM_FPC_RSV0,SLMFPDivider], 71, [1,70], 1, 3>; +defm : SLMWriteResPair<WriteFSqrt64Y, [SLM_FPC_RSV0,SLMFPDivider], 71, [1,70], 1, 3>; +defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; +defm : SLMWriteResPair<WriteFSqrt80, [SLM_FPC_RSV0,SLMFPDivider], 40, [1,40]>; +defm : SLMWriteResPair<WriteDPPD, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteDPPS, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteDPPSY, [SLM_FPC_RSV1], 3>; +defm : X86WriteResPairUnsupported<WriteDPPSZ>; +defm : SLMWriteResPair<WriteFSign, [SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WriteFRnd, [SLM_FPC_RSV1], 3>; +defm : SLMWriteResPair<WriteFRndY, [SLM_FPC_RSV1], 3>; +defm : X86WriteResPairUnsupported<WriteFRndZ>; +defm : SLMWriteResPair<WriteFLogic, [SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WriteFLogicY, [SLM_FPC_RSV01], 1>; +defm : X86WriteResPairUnsupported<WriteFLogicZ>; +defm : SLMWriteResPair<WriteFTest, [SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WriteFTestY, [SLM_FPC_RSV01], 1>; +defm : X86WriteResPairUnsupported<WriteFTestZ>; +defm : SLMWriteResPair<WriteFShuffle, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteFShuffleY, [SLM_FPC_RSV0], 1>; +defm : X86WriteResPairUnsupported<WriteFShuffleZ>; +defm : SLMWriteResPair<WriteFVarShuffle, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteFVarShuffleY,[SLM_FPC_RSV0], 1>; +defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; +defm : SLMWriteResPair<WriteFBlend, [SLM_FPC_RSV0], 1>; + +// Conversion between integer and float. +defm : SLMWriteResPair<WriteCvtSS2I, [SLM_FPC_RSV01], 4>; +defm : SLMWriteResPair<WriteCvtPS2I, [SLM_FPC_RSV01], 4>; +defm : SLMWriteResPair<WriteCvtPS2IY, [SLM_FPC_RSV01], 4>; +defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; +defm : SLMWriteResPair<WriteCvtSD2I, [SLM_FPC_RSV01], 4>; +defm : SLMWriteResPair<WriteCvtPD2I, [SLM_FPC_RSV01], 4>; +defm : SLMWriteResPair<WriteCvtPD2IY, [SLM_FPC_RSV01], 4>; +defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; + +defm : SLMWriteResPair<WriteCvtI2SS, [SLM_FPC_RSV01], 4>; +defm : SLMWriteResPair<WriteCvtI2PS, [SLM_FPC_RSV01], 4>; +defm : SLMWriteResPair<WriteCvtI2PSY, [SLM_FPC_RSV01], 4>; +defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; +defm : SLMWriteResPair<WriteCvtI2SD, [SLM_FPC_RSV01], 4>; +defm : SLMWriteResPair<WriteCvtI2PD, [SLM_FPC_RSV01], 4>; +defm : SLMWriteResPair<WriteCvtI2PDY, [SLM_FPC_RSV01], 4>; +defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; + +defm : SLMWriteResPair<WriteCvtSS2SD, [SLM_FPC_RSV01], 4>; +defm : SLMWriteResPair<WriteCvtPS2PD, [SLM_FPC_RSV01], 4>; +defm : SLMWriteResPair<WriteCvtPS2PDY, [SLM_FPC_RSV01], 4>; +defm : X86WriteResPairUnsupported<WriteCvtPS2PDZ>; +defm : SLMWriteResPair<WriteCvtSD2SS, [SLM_FPC_RSV01], 4>; +defm : SLMWriteResPair<WriteCvtPD2PS, [SLM_FPC_RSV01], 4>; +defm : SLMWriteResPair<WriteCvtPD2PSY, [SLM_FPC_RSV01], 4>; +defm : X86WriteResPairUnsupported<WriteCvtPD2PSZ>; + +// Vector integer operations. 
+def : WriteRes<WriteVecLoad, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteVecLoadX, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteVecLoadY, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteVecLoadNT, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteVecLoadNTY, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteVecMaskedLoad, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteVecMaskedLoadY, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteVecStore, [SLM_MEC_RSV]>; +def : WriteRes<WriteVecStoreX, [SLM_MEC_RSV]>; +def : WriteRes<WriteVecStoreY, [SLM_MEC_RSV]>; +def : WriteRes<WriteVecStoreNT, [SLM_MEC_RSV]>; +def : WriteRes<WriteVecStoreNTY, [SLM_MEC_RSV]>; +def : WriteRes<WriteVecMaskedStore, [SLM_MEC_RSV]>; +def : WriteRes<WriteVecMaskedStoreY, [SLM_MEC_RSV]>; +def : WriteRes<WriteVecMove, [SLM_FPC_RSV01]>; +def : WriteRes<WriteVecMoveX, [SLM_FPC_RSV01]>; +def : WriteRes<WriteVecMoveY, [SLM_FPC_RSV01]>; +def : WriteRes<WriteVecMoveToGpr, [SLM_IEC_RSV01]>; +def : WriteRes<WriteVecMoveFromGpr, [SLM_IEC_RSV01]>; + +defm : SLMWriteResPair<WriteVecShift, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteVecShiftX, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteVecShiftY, [SLM_FPC_RSV0], 1>; +defm : X86WriteResPairUnsupported<WriteVecShiftZ>; +defm : SLMWriteResPair<WriteVecShiftImm, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteVecShiftImmX,[SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteVecShiftImmY,[SLM_FPC_RSV0], 1>; +defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; +defm : SLMWriteResPair<WriteVecLogic, [SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WriteVecLogicX,[SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WriteVecLogicY,[SLM_FPC_RSV01], 1>; +defm : X86WriteResPairUnsupported<WriteVecLogicZ>; +defm : SLMWriteResPair<WriteVecTest, [SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WriteVecTestY, [SLM_FPC_RSV01], 1>; +defm : X86WriteResPairUnsupported<WriteVecTestZ>; +defm : SLMWriteResPair<WriteVecALU, [SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WriteVecALUX, [SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WriteVecALUY, [SLM_FPC_RSV01], 1>; +defm : X86WriteResPairUnsupported<WriteVecALUZ>; +defm : SLMWriteResPair<WriteVecIMul, [SLM_FPC_RSV0], 4>; +defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 4>; +defm : SLMWriteResPair<WriteVecIMulY, [SLM_FPC_RSV0], 4>; +defm : X86WriteResPairUnsupported<WriteVecIMulZ>; +// FIXME: The below is closer to correct, but caused some perf regressions. 
+//defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 11, [11], 7>; +defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 4>; +defm : SLMWriteResPair<WritePMULLDY, [SLM_FPC_RSV0], 4>; +defm : X86WriteResPairUnsupported<WritePMULLDZ>; +defm : SLMWriteResPair<WriteShuffle, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteShuffleY, [SLM_FPC_RSV0], 1>; +defm : X86WriteResPairUnsupported<WriteShuffleZ>; +defm : SLMWriteResPair<WriteShuffleX, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteVarShuffle, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteVarShuffleX, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteVarShuffleY, [SLM_FPC_RSV0], 1>; +defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; +defm : SLMWriteResPair<WriteBlend, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteBlendY, [SLM_FPC_RSV0], 1>; +defm : X86WriteResPairUnsupported<WriteBlendZ>; +defm : SLMWriteResPair<WriteMPSAD, [SLM_FPC_RSV0], 7>; +defm : SLMWriteResPair<WriteMPSADY, [SLM_FPC_RSV0], 7>; +defm : X86WriteResPairUnsupported<WriteMPSADZ>; +defm : SLMWriteResPair<WritePSADBW, [SLM_FPC_RSV0], 4>; +defm : SLMWriteResPair<WritePSADBWX, [SLM_FPC_RSV0], 4>; +defm : SLMWriteResPair<WritePSADBWY, [SLM_FPC_RSV0], 4>; +defm : X86WriteResPairUnsupported<WritePSADBWZ>; +defm : SLMWriteResPair<WritePHMINPOS, [SLM_FPC_RSV0], 4>; + +// Vector insert/extract operations. +defm : SLMWriteResPair<WriteVecInsert, [SLM_FPC_RSV0], 1>; + +def : WriteRes<WriteVecExtract, [SLM_FPC_RSV0]>; +def : WriteRes<WriteVecExtractSt, [SLM_FPC_RSV0, SLM_MEC_RSV]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} + +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// + +defm : SLMWriteResPair<WriteFHAdd, [SLM_FPC_RSV01], 3, [2]>; +defm : SLMWriteResPair<WriteFHAddY, [SLM_FPC_RSV01], 3, [2]>; +defm : X86WriteResPairUnsupported<WriteFHAddZ>; +defm : SLMWriteResPair<WritePHAdd, [SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WritePHAddX, [SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WritePHAddY, [SLM_FPC_RSV01], 1>; +defm : X86WriteResPairUnsupported<WritePHAddZ>; + +// String instructions. +// Packed Compare Implicit Length Strings, Return Mask +def : WriteRes<WritePCmpIStrM, [SLM_FPC_RSV0]> { + let Latency = 13; + let ResourceCycles = [13]; +} +def : WriteRes<WritePCmpIStrMLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { + let Latency = 13; + let ResourceCycles = [13, 1]; +} + +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [SLM_FPC_RSV0]> { + let Latency = 17; + let ResourceCycles = [17]; +} +def : WriteRes<WritePCmpEStrMLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { + let Latency = 17; + let ResourceCycles = [17, 1]; +} + +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [SLM_FPC_RSV0]> { + let Latency = 17; + let ResourceCycles = [17]; +} +def : WriteRes<WritePCmpIStrILd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { + let Latency = 17; + let ResourceCycles = [17, 1]; +} + +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [SLM_FPC_RSV0]> { + let Latency = 21; + let ResourceCycles = [21]; +} +def : WriteRes<WritePCmpEStrILd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { + let Latency = 21; + let ResourceCycles = [21, 1]; +} + +// MOVMSK Instructions. 
+def : WriteRes<WriteFMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; } +def : WriteRes<WriteVecMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; } +def : WriteRes<WriteVecMOVMSKY, [SLM_FPC_RSV1]> { let Latency = 4; } +def : WriteRes<WriteMMXMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; } + +// AES Instructions. +def : WriteRes<WriteAESDecEnc, [SLM_FPC_RSV0]> { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes<WriteAESDecEncLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { + let Latency = 8; + let ResourceCycles = [5, 1]; +} + +def : WriteRes<WriteAESIMC, [SLM_FPC_RSV0]> { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes<WriteAESIMCLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { + let Latency = 8; + let ResourceCycles = [5, 1]; +} + +def : WriteRes<WriteAESKeyGen, [SLM_FPC_RSV0]> { + let Latency = 8; + let ResourceCycles = [5]; +} +def : WriteRes<WriteAESKeyGenLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { + let Latency = 8; + let ResourceCycles = [5, 1]; +} + +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [SLM_FPC_RSV0]> { + let Latency = 10; + let ResourceCycles = [10]; +} +def : WriteRes<WriteCLMulLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> { + let Latency = 10; + let ResourceCycles = [10, 1]; +} + +def : WriteRes<WriteSystem, [SLM_FPC_RSV0]> { let Latency = 100; } +def : WriteRes<WriteMicrocoded, [SLM_FPC_RSV0]> { let Latency = 100; } +def : WriteRes<WriteFence, [SLM_MEC_RSV]>; +def : WriteRes<WriteNop, []>; + +// AVX/FMA is not supported on that architecture, but we should define the basic +// scheduling resources anyway. +def : WriteRes<WriteIMulH, [SLM_FPC_RSV0]>; +defm : X86WriteResPairUnsupported<WriteFBlendY>; +defm : X86WriteResPairUnsupported<WriteFBlendZ>; +defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 1>; +defm : X86WriteResPairUnsupported<WriteVarBlendY>; +defm : X86WriteResPairUnsupported<WriteVarBlendZ>; +defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 1>; +defm : X86WriteResPairUnsupported<WriteFVarBlendY>; +defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; +defm : X86WriteResPairUnsupported<WriteFShuffle256>; +defm : X86WriteResPairUnsupported<WriteFVarShuffle256>; +defm : X86WriteResPairUnsupported<WriteShuffle256>; +defm : X86WriteResPairUnsupported<WriteVarShuffle256>; +defm : SLMWriteResPair<WriteVarVecShift, [SLM_FPC_RSV0], 1>; +defm : X86WriteResPairUnsupported<WriteVarVecShiftY>; +defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; +defm : X86WriteResPairUnsupported<WriteFMA>; +defm : X86WriteResPairUnsupported<WriteFMAX>; +defm : X86WriteResPairUnsupported<WriteFMAY>; +defm : X86WriteResPairUnsupported<WriteFMAZ>; + +defm : X86WriteResPairUnsupported<WriteCvtPH2PS>; +defm : X86WriteResPairUnsupported<WriteCvtPH2PSY>; +defm : X86WriteResPairUnsupported<WriteCvtPH2PSZ>; +defm : X86WriteResUnsupported<WriteCvtPS2PH>; +defm : X86WriteResUnsupported<WriteCvtPS2PHY>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; +defm : X86WriteResUnsupported<WriteCvtPS2PHSt>; +defm : X86WriteResUnsupported<WriteCvtPS2PHYSt>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; + +} // SchedModel diff --git a/capstone/suite/synctools/tablegen/X86/back/X86ScheduleZnver1.td b/capstone/suite/synctools/tablegen/X86/back/X86ScheduleZnver1.td new file mode 100644 index 000000000..7184b850a --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86ScheduleZnver1.td @@ -0,0 +1,1544 @@ +//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open 
Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Znver1 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Znver1Model : SchedMachineModel {
+  // Zen can decode 4 instructions per cycle.
+  let IssueWidth = 4;
+  // MicroOpBufferSize is based on the size of the reorder buffer.
+  let MicroOpBufferSize = 192;
+  let LoadLatency = 4;
+  let MispredictPenalty = 17;
+  let HighLatency = 25;
+  let PostRAScheduler = 1;
+
+  // FIXME: Not all instructions have been covered yet, so clear this flag to
+  // mark the model as incomplete.
+  let CompleteModel = 0;
+}
+
+let SchedModel = Znver1Model in {
+
+// Zen can issue micro-ops to 10 different units in one cycle. These are:
+// * Four integer ALU units (ZALU0, ZALU1, ZALU2, ZALU3)
+// * Two AGU units (ZAGU0, ZAGU1)
+// * Four FPU units (ZFPU0, ZFPU1, ZFPU2, ZFPU3)
+// The AGUs feed the load/store queues at up to two loads and one store per
+// cycle.
+
+// The four ALU units are defined below.
+def ZnALU0 : ProcResource<1>;
+def ZnALU1 : ProcResource<1>;
+def ZnALU2 : ProcResource<1>;
+def ZnALU3 : ProcResource<1>;
+
+// The two AGU units are defined below.
+def ZnAGU0 : ProcResource<1>;
+def ZnAGU1 : ProcResource<1>;
+
+// The four FPU units are defined below.
+def ZnFPU0 : ProcResource<1>;
+def ZnFPU1 : ProcResource<1>;
+def ZnFPU2 : ProcResource<1>;
+def ZnFPU3 : ProcResource<1>;
+
+// FPU groupings.
+def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>;
+def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>;
+def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>;
+def ZnFPU13 : ProcResGroup<[ZnFPU1, ZnFPU3]>;
+def ZnFPU23 : ProcResGroup<[ZnFPU2, ZnFPU3]>;
+def ZnFPU02 : ProcResGroup<[ZnFPU0, ZnFPU2]>;
+def ZnFPU03 : ProcResGroup<[ZnFPU0, ZnFPU3]>;
+
+// Below are the groupings of the units.
+// Micro-ops that can issue to multiple units are handled this way.
+
+// ALU grouping: ZnALU03 groups ALU0 and ALU3.
+def ZnALU03 : ProcResGroup<[ZnALU0, ZnALU3]>;
+
+// 56-entry (14x4) integer scheduler.
+def ZnALU : ProcResGroup<[ZnALU0, ZnALU1, ZnALU2, ZnALU3]> {
+  let BufferSize = 56;
+}
+
+// 28-entry (14x2) AGU group. AGUs can't be used for all ALU operations,
+// but are relevant for some instructions.
+def ZnAGU : ProcResGroup<[ZnAGU0, ZnAGU1]> {
+  let BufferSize = 28;
+}
+
+// Integer multiplication is issued on ALU1.
+def ZnMultiplier : ProcResource<1>;
+
+// Integer division is issued on ALU2.
+def ZnDivider : ProcResource<1>;
+
+// The 4-cycle load-to-use latency is captured here.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// The Integer PRF for Zen is 168 entries, and it holds the architectural and
+// speculative versions of the 64-bit integer registers.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def ZnIntegerPRF : RegisterFile<168, [GR64, CCR]>;
+
+// 36-entry (9x4) floating-point scheduler.
+def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]> {
+  let BufferSize = 36;
+}
+
+// The Zen FP Retire Queue renames SIMD and FP uOps onto a pool of 160 128-bit
+// registers. Operations on 256-bit data types are cracked into two COPs.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def ZnFpuPRF : RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;
+
+// The retire control unit (RCU) can track up to 192 macro ops in-flight.
+// The retire unit handles in-order commit of up to 8 macro ops per cycle.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+// Note that the retire unit is shared between integer and FP ops.
+// In SMT mode it is 96 entries per thread, but we do not use that conservative
+// value here because there is currently no way to fully model SMT mode.
+def ZnRCU : RetireControlUnit<192, 8>;
+
+// FIXME: there are 72 read buffers and 44 write buffers.
+
+// A folded load is an instruction that loads and then operates on the data,
+// e.g. ADDPD xmm, [mem]. Instructions with folded loads are usually
+// micro-fused, so they only appear as two micro-ops:
+//   a. the load, and
+//   b. the operation (addpd in this example).
+// This multiclass is for folded loads for integer units.
+multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW,
+                          list<ProcResourceKind> ExePorts,
+                          int Lat, list<int> Res = [], int UOps = 1,
+                          int LoadLat = 4, int LoadUOps = 1> {
+  // The register variant takes a single cycle on the execution port.
+  def : WriteRes<SchedRW, ExePorts> {
+    let Latency = Lat;
+    let ResourceCycles = Res;
+    let NumMicroOps = UOps;
+  }
+
+  // The memory variant also uses a cycle on ZnAGU and adds LoadLat cycles to
+  // the latency (default = 4).
+  def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> {
+    let Latency = !add(Lat, LoadLat);
+    let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+    let NumMicroOps = !add(UOps, LoadUOps);
+  }
+}
+
+// This multiclass is for folded loads for floating point units.
+multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW,
+                             list<ProcResourceKind> ExePorts,
+                             int Lat, list<int> Res = [], int UOps = 1,
+                             int LoadLat = 7, int LoadUOps = 0> {
+  // The register variant takes a single cycle on the execution port.
+  def : WriteRes<SchedRW, ExePorts> {
+    let Latency = Lat;
+    let ResourceCycles = Res;
+    let NumMicroOps = UOps;
+  }
+
+  // The memory variant also uses a cycle on ZnAGU and adds LoadLat cycles to
+  // the latency (default = 7).
+ def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> { + let Latency = !add(Lat, LoadLat); + let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res)); + let NumMicroOps = !add(UOps, LoadUOps); + } +} + +// WriteRMW is set for instructions with Memory write +// operation in codegen +def : WriteRes<WriteRMW, [ZnAGU]>; + +def : WriteRes<WriteStore, [ZnAGU]>; +def : WriteRes<WriteStoreNT, [ZnAGU]>; +def : WriteRes<WriteMove, [ZnALU]>; +def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; } + +def : WriteRes<WriteZero, []>; +def : WriteRes<WriteLEA, [ZnALU]>; +defm : ZnWriteResPair<WriteALU, [ZnALU], 1>; +defm : ZnWriteResPair<WriteADC, [ZnALU], 1>; +defm : ZnWriteResPair<WriteIMul, [ZnALU1, ZnMultiplier], 4>; +defm : ZnWriteResPair<WriteIMul64, [ZnALU1, ZnMultiplier], 4, [1,1], 2>; + +defm : X86WriteRes<WriteBSWAP32, [ZnALU], 1, [4], 1>; +defm : X86WriteRes<WriteBSWAP64, [ZnALU], 1, [4], 1>; + +defm : ZnWriteResPair<WriteShift, [ZnALU], 1>; + +defm : X86WriteRes<WriteSHDrri, [ZnALU], 1, [1], 1>; +defm : X86WriteResUnsupported<WriteSHDrrcl>; +defm : X86WriteResUnsupported<WriteSHDmri>; +defm : X86WriteResUnsupported<WriteSHDmrcl>; + +defm : ZnWriteResPair<WriteJump, [ZnALU], 1>; +defm : ZnWriteResFpuPair<WriteCRC32, [ZnFPU0], 3>; + +defm : ZnWriteResPair<WriteCMOV, [ZnALU], 1>; +defm : ZnWriteResPair<WriteCMOV2, [ZnALU], 1>; +def : WriteRes<WriteSETCC, [ZnALU]>; +def : WriteRes<WriteSETCCStore, [ZnALU, ZnAGU]>; +defm : X86WriteRes<WriteLAHFSAHF, [ZnALU], 2, [1], 2>; +def : WriteRes<WriteBitTest,[ZnALU]>; + +// Bit counts. +defm : ZnWriteResPair<WriteBSF, [ZnALU], 3>; +defm : ZnWriteResPair<WriteBSR, [ZnALU], 3>; +defm : ZnWriteResPair<WriteLZCNT, [ZnALU], 2>; +defm : ZnWriteResPair<WriteTZCNT, [ZnALU], 2>; +defm : ZnWriteResPair<WritePOPCNT, [ZnALU], 1>; + +// Treat misc copies as a move. 
+def : InstRW<[WriteMove], (instrs COPY)>; + +// BMI1 BEXTR, BMI2 BZHI +defm : ZnWriteResPair<WriteBEXTR, [ZnALU], 1>; +defm : ZnWriteResPair<WriteBZHI, [ZnALU], 1>; + +// IDIV +defm : ZnWriteResPair<WriteDiv8, [ZnALU2, ZnDivider], 15, [1,15], 1>; +defm : ZnWriteResPair<WriteDiv16, [ZnALU2, ZnDivider], 17, [1,17], 2>; +defm : ZnWriteResPair<WriteDiv32, [ZnALU2, ZnDivider], 25, [1,25], 2>; +defm : ZnWriteResPair<WriteDiv64, [ZnALU2, ZnDivider], 41, [1,41], 2>; +defm : ZnWriteResPair<WriteIDiv8, [ZnALU2, ZnDivider], 15, [1,15], 1>; +defm : ZnWriteResPair<WriteIDiv16, [ZnALU2, ZnDivider], 17, [1,17], 2>; +defm : ZnWriteResPair<WriteIDiv32, [ZnALU2, ZnDivider], 25, [1,25], 2>; +defm : ZnWriteResPair<WriteIDiv64, [ZnALU2, ZnDivider], 41, [1,41], 2>; + +// IMULH +def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{ + let Latency = 4; +} + +// Floating point operations +defm : X86WriteRes<WriteFLoad, [ZnAGU], 8, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [ZnAGU], 8, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [ZnAGU], 8, [1], 1>; +defm : X86WriteRes<WriteFMaskedLoad, [ZnAGU,ZnFPU01], 8, [1,1], 1>; +defm : X86WriteRes<WriteFMaskedLoadY, [ZnAGU,ZnFPU01], 8, [1,2], 2>; +defm : X86WriteRes<WriteFStore, [ZnAGU], 1, [1], 1>; +defm : X86WriteRes<WriteFStoreX, [ZnAGU], 1, [1], 1>; +defm : X86WriteRes<WriteFStoreY, [ZnAGU], 1, [1], 1>; +defm : X86WriteRes<WriteFStoreNT, [ZnAGU,ZnFPU2], 8, [1,1], 1>; +defm : X86WriteRes<WriteFStoreNTX, [ZnAGU], 1, [1], 1>; +defm : X86WriteRes<WriteFStoreNTY, [ZnAGU], 1, [1], 1>; +defm : X86WriteRes<WriteFMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteFMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>; +defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>; + +defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WriteFAddX, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WriteFAddY, [ZnFPU0], 3>; +defm : X86WriteResPairUnsupported<WriteFAddZ>; +defm : ZnWriteResFpuPair<WriteFAdd64, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WriteFAdd64X, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WriteFAdd64Y, [ZnFPU0], 3>; +defm : X86WriteResPairUnsupported<WriteFAdd64Z>; +defm : ZnWriteResFpuPair<WriteFCmp, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WriteFCmpX, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WriteFCmpY, [ZnFPU0], 3>; +defm : X86WriteResPairUnsupported<WriteFCmpZ>; +defm : ZnWriteResFpuPair<WriteFCmp64, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WriteFCmp64X, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WriteFCmp64Y, [ZnFPU0], 3>; +defm : X86WriteResPairUnsupported<WriteFCmp64Z>; +defm : ZnWriteResFpuPair<WriteFCom, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WriteFBlend, [ZnFPU01], 1>; +defm : ZnWriteResFpuPair<WriteFBlendY, [ZnFPU01], 1>; +defm : X86WriteResPairUnsupported<WriteFBlendZ>; +defm : ZnWriteResFpuPair<WriteFVarBlend, [ZnFPU01], 1>; +defm : ZnWriteResFpuPair<WriteFVarBlendY,[ZnFPU01], 1>; +defm : X86WriteResPairUnsupported<WriteFVarBlendZ>; +defm : ZnWriteResFpuPair<WriteVarBlend, [ZnFPU0], 1>; +defm : ZnWriteResFpuPair<WriteVarBlendY, [ZnFPU0], 1>; +defm : X86WriteResPairUnsupported<WriteVarBlendZ>; +defm : ZnWriteResFpuPair<WriteCvtSS2I, [ZnFPU3], 5>; +defm : ZnWriteResFpuPair<WriteCvtPS2I, [ZnFPU3], 5>; +defm : ZnWriteResFpuPair<WriteCvtPS2IY, [ZnFPU3], 5>; +defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>; +defm : ZnWriteResFpuPair<WriteCvtSD2I, [ZnFPU3], 5>; +defm : ZnWriteResFpuPair<WriteCvtPD2I, [ZnFPU3], 5>; +defm : ZnWriteResFpuPair<WriteCvtPD2IY, 
[ZnFPU3], 5>; +defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>; +defm : ZnWriteResFpuPair<WriteCvtI2SS, [ZnFPU3], 5>; +defm : ZnWriteResFpuPair<WriteCvtI2PS, [ZnFPU3], 5>; +defm : ZnWriteResFpuPair<WriteCvtI2PSY, [ZnFPU3], 5>; +defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>; +defm : ZnWriteResFpuPair<WriteCvtI2SD, [ZnFPU3], 5>; +defm : ZnWriteResFpuPair<WriteCvtI2PD, [ZnFPU3], 5>; +defm : ZnWriteResFpuPair<WriteCvtI2PDY, [ZnFPU3], 5>; +defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>; +defm : ZnWriteResFpuPair<WriteFDiv, [ZnFPU3], 15>; +defm : ZnWriteResFpuPair<WriteFDivX, [ZnFPU3], 15>; +//defm : ZnWriteResFpuPair<WriteFDivY, [ZnFPU3], 15>; +defm : X86WriteResPairUnsupported<WriteFDivZ>; +defm : ZnWriteResFpuPair<WriteFDiv64, [ZnFPU3], 15>; +defm : ZnWriteResFpuPair<WriteFDiv64X, [ZnFPU3], 15>; +//defm : ZnWriteResFpuPair<WriteFDiv64Y, [ZnFPU3], 15>; +defm : X86WriteResPairUnsupported<WriteFDiv64Z>; +defm : ZnWriteResFpuPair<WriteFSign, [ZnFPU3], 2>; +defm : ZnWriteResFpuPair<WriteFRnd, [ZnFPU3], 4, [1], 1, 7, 1>; // FIXME: Should folds require 1 extra uops? +defm : ZnWriteResFpuPair<WriteFRndY, [ZnFPU3], 4, [1], 1, 7, 1>; // FIXME: Should folds require 1 extra uops? +defm : X86WriteResPairUnsupported<WriteFRndZ>; +defm : ZnWriteResFpuPair<WriteFLogic, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteFLogicY, [ZnFPU], 1>; +defm : X86WriteResPairUnsupported<WriteFLogicZ>; +defm : ZnWriteResFpuPair<WriteFTest, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteFTestY, [ZnFPU], 1>; +defm : X86WriteResPairUnsupported<WriteFTestZ>; +defm : ZnWriteResFpuPair<WriteFShuffle, [ZnFPU12], 1>; +defm : ZnWriteResFpuPair<WriteFShuffleY, [ZnFPU12], 1>; +defm : X86WriteResPairUnsupported<WriteFShuffleZ>; +defm : ZnWriteResFpuPair<WriteFVarShuffle, [ZnFPU12], 1>; +defm : ZnWriteResFpuPair<WriteFVarShuffleY,[ZnFPU12], 1>; +defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>; +defm : ZnWriteResFpuPair<WriteFMul, [ZnFPU01], 3, [1], 1, 7, 1>; +defm : ZnWriteResFpuPair<WriteFMulX, [ZnFPU01], 3, [1], 1, 7, 1>; +defm : ZnWriteResFpuPair<WriteFMulY, [ZnFPU01], 4, [1], 1, 7, 1>; +defm : X86WriteResPairUnsupported<WriteFMulZ>; +defm : ZnWriteResFpuPair<WriteFMul64, [ZnFPU01], 3, [1], 1, 7, 1>; +defm : ZnWriteResFpuPair<WriteFMul64X, [ZnFPU01], 3, [1], 1, 7, 1>; +defm : ZnWriteResFpuPair<WriteFMul64Y, [ZnFPU01], 4, [1], 1, 7, 1>; +defm : X86WriteResPairUnsupported<WriteFMul64Z>; +defm : ZnWriteResFpuPair<WriteFMA, [ZnFPU03], 5>; +defm : ZnWriteResFpuPair<WriteFMAX, [ZnFPU03], 5>; +defm : ZnWriteResFpuPair<WriteFMAY, [ZnFPU03], 5>; +defm : X86WriteResPairUnsupported<WriteFMAZ>; +defm : ZnWriteResFpuPair<WriteFRcp, [ZnFPU01], 5>; +defm : ZnWriteResFpuPair<WriteFRcpX, [ZnFPU01], 5>; +defm : ZnWriteResFpuPair<WriteFRcpY, [ZnFPU01], 5, [1], 1, 7, 2>; +defm : X86WriteResPairUnsupported<WriteFRcpZ>; +//defm : ZnWriteResFpuPair<WriteFRsqrt, [ZnFPU02], 5>; +defm : ZnWriteResFpuPair<WriteFRsqrtX, [ZnFPU01], 5, [1], 1, 7, 1>; +//defm : ZnWriteResFpuPair<WriteFRsqrtY, [ZnFPU01], 5, [2], 2>; +defm : X86WriteResPairUnsupported<WriteFRsqrtZ>; +defm : ZnWriteResFpuPair<WriteFSqrt, [ZnFPU3], 20, [20]>; +defm : ZnWriteResFpuPair<WriteFSqrtX, [ZnFPU3], 20, [20]>; +defm : ZnWriteResFpuPair<WriteFSqrtY, [ZnFPU3], 28, [28], 1, 7, 1>; +defm : X86WriteResPairUnsupported<WriteFSqrtZ>; +defm : ZnWriteResFpuPair<WriteFSqrt64, [ZnFPU3], 20, [20]>; +defm : ZnWriteResFpuPair<WriteFSqrt64X, [ZnFPU3], 20, [20]>; +defm : ZnWriteResFpuPair<WriteFSqrt64Y, [ZnFPU3], 40, [40], 1, 7, 1>; +defm : X86WriteResPairUnsupported<WriteFSqrt64Z>; +defm : 
ZnWriteResFpuPair<WriteFSqrt80, [ZnFPU3], 20, [20]>; + +// Vector integer operations which uses FPU units +defm : X86WriteRes<WriteVecLoad, [ZnAGU], 8, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [ZnAGU], 8, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [ZnAGU], 8, [1], 1>; +defm : X86WriteRes<WriteVecLoadNT, [ZnAGU], 8, [1], 1>; +defm : X86WriteRes<WriteVecLoadNTY, [ZnAGU], 8, [1], 1>; +defm : X86WriteRes<WriteVecMaskedLoad, [ZnAGU,ZnFPU01], 8, [1,2], 2>; +defm : X86WriteRes<WriteVecMaskedLoadY, [ZnAGU,ZnFPU01], 9, [1,3], 2>; +defm : X86WriteRes<WriteVecStore, [ZnAGU], 1, [1], 1>; +defm : X86WriteRes<WriteVecStoreX, [ZnAGU], 1, [1], 1>; +defm : X86WriteRes<WriteVecStoreY, [ZnAGU], 1, [1], 1>; +defm : X86WriteRes<WriteVecStoreNT, [ZnAGU], 1, [1], 1>; +defm : X86WriteRes<WriteVecStoreNTY, [ZnAGU], 1, [1], 1>; +defm : X86WriteRes<WriteVecMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>; +defm : X86WriteRes<WriteVecMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>; +defm : X86WriteRes<WriteVecMove, [ZnFPU], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [ZnFPU], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [ZnFPU], 2, [1], 2>; +defm : X86WriteRes<WriteVecMoveToGpr, [ZnFPU2], 2, [1], 1>; +defm : X86WriteRes<WriteVecMoveFromGpr, [ZnFPU2], 3, [1], 1>; +defm : X86WriteRes<WriteEMMS, [ZnFPU], 2, [1], 1>; + +defm : ZnWriteResFpuPair<WriteVecShift, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVecShiftX, [ZnFPU2], 1>; +defm : ZnWriteResFpuPair<WriteVecShiftY, [ZnFPU2], 2>; +defm : X86WriteResPairUnsupported<WriteVecShiftZ>; +defm : ZnWriteResFpuPair<WriteVecShiftImm, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVecShiftImmX, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVecShiftImmY, [ZnFPU], 1>; +defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>; +defm : ZnWriteResFpuPair<WriteVecLogic, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVecLogicX, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVecLogicY, [ZnFPU], 1>; +defm : X86WriteResPairUnsupported<WriteVecLogicZ>; +defm : ZnWriteResFpuPair<WriteVecTest, [ZnFPU12], 1, [2], 1, 7, 1>; +defm : ZnWriteResFpuPair<WriteVecTestY, [ZnFPU12], 1, [2], 1, 7, 1>; +defm : X86WriteResPairUnsupported<WriteVecTestZ>; +defm : ZnWriteResFpuPair<WriteVecALU, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVecALUX, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVecALUY, [ZnFPU], 1>; +defm : X86WriteResPairUnsupported<WriteVecALUZ>; +defm : ZnWriteResFpuPair<WriteVecIMul, [ZnFPU0], 4>; +defm : ZnWriteResFpuPair<WriteVecIMulX, [ZnFPU0], 4>; +defm : ZnWriteResFpuPair<WriteVecIMulY, [ZnFPU0], 4>; +defm : X86WriteResPairUnsupported<WriteVecIMulZ>; +defm : ZnWriteResFpuPair<WritePMULLD, [ZnFPU0], 4, [1], 1, 7, 1>; // FIXME +defm : ZnWriteResFpuPair<WritePMULLDY, [ZnFPU0], 5, [2], 1, 7, 1>; // FIXME +defm : X86WriteResPairUnsupported<WritePMULLDZ>; +defm : ZnWriteResFpuPair<WriteShuffle, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteShuffleX, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteShuffleY, [ZnFPU], 1>; +defm : X86WriteResPairUnsupported<WriteShuffleZ>; +defm : ZnWriteResFpuPair<WriteVarShuffle, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVarShuffleX,[ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVarShuffleY,[ZnFPU], 1>; +defm : X86WriteResPairUnsupported<WriteVarShuffleZ>; +defm : ZnWriteResFpuPair<WriteBlend, [ZnFPU01], 1>; +defm : ZnWriteResFpuPair<WriteBlendY, [ZnFPU01], 1>; +defm : X86WriteResPairUnsupported<WriteBlendZ>; +defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU], 2>; +defm : ZnWriteResFpuPair<WriteVarShuffle256, [ZnFPU], 2>; +defm : ZnWriteResFpuPair<WritePSADBW, [ZnFPU0], 3>; +defm : 
ZnWriteResFpuPair<WritePSADBWX, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WritePSADBWY, [ZnFPU0], 3>; +defm : X86WriteResPairUnsupported<WritePSADBWZ>; +defm : ZnWriteResFpuPair<WritePHMINPOS, [ZnFPU0], 4>; + +// Vector Shift Operations +defm : ZnWriteResFpuPair<WriteVarVecShift, [ZnFPU12], 1>; +defm : ZnWriteResFpuPair<WriteVarVecShiftY, [ZnFPU12], 1>; +defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>; + +// Vector insert/extract operations. +defm : ZnWriteResFpuPair<WriteVecInsert, [ZnFPU], 1>; + +def : WriteRes<WriteVecExtract, [ZnFPU12, ZnFPU2]> { + let Latency = 2; + let ResourceCycles = [1, 2]; +} +def : WriteRes<WriteVecExtractSt, [ZnAGU, ZnFPU12, ZnFPU2]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 2, 3]; +} + +// MOVMSK Instructions. +def : WriteRes<WriteFMOVMSK, [ZnFPU2]>; +def : WriteRes<WriteMMXMOVMSK, [ZnFPU2]>; +def : WriteRes<WriteVecMOVMSK, [ZnFPU2]>; + +def : WriteRes<WriteVecMOVMSKY, [ZnFPU2]> { + let NumMicroOps = 2; + let Latency = 2; + let ResourceCycles = [2]; +} + +// AES Instructions. +defm : ZnWriteResFpuPair<WriteAESDecEnc, [ZnFPU01], 4>; +defm : ZnWriteResFpuPair<WriteAESIMC, [ZnFPU01], 4>; +defm : ZnWriteResFpuPair<WriteAESKeyGen, [ZnFPU01], 4>; + +def : WriteRes<WriteFence, [ZnAGU]>; +def : WriteRes<WriteNop, []>; + +// Following instructions with latency=100 are microcoded. +// We set long latency so as to block the entire pipeline. +defm : ZnWriteResFpuPair<WriteFShuffle256, [ZnFPU], 100>; +defm : ZnWriteResFpuPair<WriteFVarShuffle256, [ZnFPU], 100>; + +// Microcoded Instructions +def ZnWriteMicrocoded : SchedWriteRes<[]> { + let Latency = 100; +} + +def : SchedAlias<WriteMicrocoded, ZnWriteMicrocoded>; +def : SchedAlias<WriteFCMOV, ZnWriteMicrocoded>; +def : SchedAlias<WriteSystem, ZnWriteMicrocoded>; +def : SchedAlias<WriteMPSAD, ZnWriteMicrocoded>; +def : SchedAlias<WriteMPSADY, ZnWriteMicrocoded>; +def : SchedAlias<WriteMPSADLd, ZnWriteMicrocoded>; +def : SchedAlias<WriteMPSADYLd, ZnWriteMicrocoded>; +def : SchedAlias<WriteCLMul, ZnWriteMicrocoded>; +def : SchedAlias<WriteCLMulLd, ZnWriteMicrocoded>; +def : SchedAlias<WritePCmpIStrM, ZnWriteMicrocoded>; +def : SchedAlias<WritePCmpIStrMLd, ZnWriteMicrocoded>; +def : SchedAlias<WritePCmpEStrI, ZnWriteMicrocoded>; +def : SchedAlias<WritePCmpEStrILd, ZnWriteMicrocoded>; +def : SchedAlias<WritePCmpEStrM, ZnWriteMicrocoded>; +def : SchedAlias<WritePCmpEStrMLd, ZnWriteMicrocoded>; +def : SchedAlias<WritePCmpIStrI, ZnWriteMicrocoded>; +def : SchedAlias<WritePCmpIStrILd, ZnWriteMicrocoded>; +def : SchedAlias<WriteLDMXCSR, ZnWriteMicrocoded>; +def : SchedAlias<WriteSTMXCSR, ZnWriteMicrocoded>; + +//=== Regex based InstRW ===// +// Notation: +// - r: register. +// - m = memory. +// - i = immediate +// - mm: 64 bit mmx register. +// - x = 128 bit xmm register. +// - (x)mm = mmx or xmm register. +// - y = 256 bit ymm register. +// - v = any vector register. + +//=== Integer Instructions ===// +//-- Move instructions --// +// MOV. +// r16,m. +def : InstRW<[WriteALULd, ReadAfterLd], (instregex "MOV16rm")>; + +// MOVSX, MOVZX. +// r,m. +def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>; + +// XCHG. +// r,r. +def ZnWriteXCHG : SchedWriteRes<[ZnALU]> { + let NumMicroOps = 2; + let ResourceCycles = [2]; +} + +def : InstRW<[ZnWriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>; + +// r,m. 
+def ZnWriteXCHGrm : SchedWriteRes<[ZnAGU, ZnALU]> { + let Latency = 5; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteXCHGrm, ReadAfterLd], (instregex "XCHG(8|16|32|64)rm")>; + +def : InstRW<[WriteMicrocoded], (instrs XLAT)>; + +// POP16. +// r. +def ZnWritePop16r : SchedWriteRes<[ZnAGU]>{ + let Latency = 5; + let NumMicroOps = 2; +} +def : InstRW<[ZnWritePop16r], (instregex "POP16rmm")>; +def : InstRW<[WriteMicrocoded], (instregex "POPF(16|32)")>; +def : InstRW<[WriteMicrocoded], (instregex "POPA(16|32)")>; + + +// PUSH. +// r. Has default values. +// m. +def ZnWritePUSH : SchedWriteRes<[ZnAGU]>{ + let Latency = 4; +} +def : InstRW<[ZnWritePUSH], (instregex "PUSH(16|32)rmm")>; + +//PUSHF +def : InstRW<[WriteMicrocoded], (instregex "PUSHF(16|32)")>; + +// PUSHA. +def ZnWritePushA : SchedWriteRes<[ZnAGU]> { + let Latency = 8; +} +def : InstRW<[ZnWritePushA], (instregex "PUSHA(16|32)")>; + +//LAHF +def : InstRW<[WriteMicrocoded], (instrs LAHF)>; + +// MOVBE. +// r,m. +def ZnWriteMOVBE : SchedWriteRes<[ZnAGU, ZnALU]> { + let Latency = 5; +} +def : InstRW<[ZnWriteMOVBE, ReadAfterLd], (instregex "MOVBE(16|32|64)rm")>; + +// m16,r16. +def : InstRW<[ZnWriteMOVBE], (instregex "MOVBE(16|32|64)mr")>; + +//-- Arithmetic instructions --// + +// ADD SUB. +// m,r/i. +def : InstRW<[WriteALULd], (instregex "(ADD|SUB)(8|16|32|64)m(r|i)", + "(ADD|SUB)(8|16|32|64)mi8", + "(ADD|SUB)64mi32")>; + +// ADC SBB. +// m,r/i. +def : InstRW<[WriteALULd], + (instregex "(ADC|SBB)(8|16|32|64)m(r|i)", + "(ADC|SBB)(16|32|64)mi8", + "(ADC|SBB)64mi32")>; + +// INC DEC NOT NEG. +// m. +def : InstRW<[WriteALULd], + (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m")>; + +// MUL IMUL. +// r16. +def ZnWriteMul16 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { + let Latency = 3; +} +def : InstRW<[ZnWriteMul16], (instrs IMUL16r, MUL16r)>; +def : InstRW<[ZnWriteMul16], (instrs IMUL16rr, IMUL16rri, IMUL16rri8)>; // TODO: is this right? +def : InstRW<[ZnWriteMul16], (instrs IMUL16rm, IMUL16rmi, IMUL16rmi8)>; // TODO: this is definitely wrong but matches what the instregex did. + +// m16. +def ZnWriteMul16Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { + let Latency = 8; +} +def : InstRW<[ZnWriteMul16Ld, ReadAfterLd], (instrs IMUL16m, MUL16m)>; + +// r32. +def ZnWriteMul32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { + let Latency = 3; +} +def : InstRW<[ZnWriteMul32], (instrs IMUL32r, MUL32r)>; +def : InstRW<[ZnWriteMul32], (instrs IMUL32rr, IMUL32rri, IMUL32rri8)>; // TODO: is this right? +def : InstRW<[ZnWriteMul32], (instrs IMUL32rm, IMUL32rmi, IMUL32rmi8)>; // TODO: this is definitely wrong but matches what the instregex did. + +// m32. +def ZnWriteMul32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { + let Latency = 8; +} +def : InstRW<[ZnWriteMul32Ld, ReadAfterLd], (instrs IMUL32m, MUL32m)>; + +// r64. +def ZnWriteMul64 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { + let Latency = 4; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteMul64], (instrs IMUL64r, MUL64r)>; +def : InstRW<[ZnWriteMul64], (instrs IMUL64rr, IMUL64rri8, IMUL64rri32)>; // TODO: is this right? +def : InstRW<[ZnWriteMul64], (instrs IMUL64rm, IMUL64rmi32, IMUL64rmi8)>; // TODO: this is definitely wrong but matches what the instregex did. + +// m64. +def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { + let Latency = 9; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteMul64Ld, ReadAfterLd], (instrs IMUL64m, MUL64m)>; + +// MULX. +// r32,r32,r32. 
+def ZnWriteMulX32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { + let Latency = 3; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWriteMulX32], (instrs MULX32rr)>; + +// r32,r32,m32. +def ZnWriteMulX32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { + let Latency = 8; + let ResourceCycles = [1, 2, 2]; +} +def : InstRW<[ZnWriteMulX32Ld, ReadAfterLd], (instrs MULX32rm)>; + +// r64,r64,r64. +def ZnWriteMulX64 : SchedWriteRes<[ZnALU1]> { + let Latency = 3; +} +def : InstRW<[ZnWriteMulX64], (instrs MULX64rr)>; + +// r64,r64,m64. +def ZnWriteMulX64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { + let Latency = 8; +} +def : InstRW<[ZnWriteMulX64Ld, ReadAfterLd], (instrs MULX64rm)>; + +//-- Control transfer instructions --// + +// J(E|R)CXZ. +def ZnWriteJCXZ : SchedWriteRes<[ZnALU03]>; +def : InstRW<[ZnWriteJCXZ], (instrs JCXZ, JECXZ, JRCXZ)>; + +// INTO +def : InstRW<[WriteMicrocoded], (instrs INTO)>; + +// LOOP. +def ZnWriteLOOP : SchedWriteRes<[ZnALU03]>; +def : InstRW<[ZnWriteLOOP], (instrs LOOP)>; + +// LOOP(N)E, LOOP(N)Z +def ZnWriteLOOPE : SchedWriteRes<[ZnALU03]>; +def : InstRW<[ZnWriteLOOPE], (instrs LOOPE, LOOPNE)>; + +// CALL. +// r. +def ZnWriteCALLr : SchedWriteRes<[ZnAGU, ZnALU03]>; +def : InstRW<[ZnWriteCALLr], (instregex "CALL(16|32)r")>; + +def : InstRW<[WriteMicrocoded], (instregex "CALL(16|32)m")>; + +// RET. +def ZnWriteRET : SchedWriteRes<[ZnALU03]> { + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)", + "IRET(16|32|64)")>; + +//-- Logic instructions --// + +// AND OR XOR. +// m,r/i. +def : InstRW<[WriteALULd], + (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)", + "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>; + +// Define ALU latency variants +def ZnWriteALULat2 : SchedWriteRes<[ZnALU]> { + let Latency = 2; +} +def ZnWriteALULat2Ld : SchedWriteRes<[ZnAGU, ZnALU]> { + let Latency = 6; +} + +// BT. +// m,i. +def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>; + +// BTR BTS BTC. +// r,r,i. +def ZnWriteBTRSC : SchedWriteRes<[ZnALU]> { + let Latency = 2; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteBTRSC], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>; + +// m,r,i. +def ZnWriteBTRSCm : SchedWriteRes<[ZnAGU, ZnALU]> { + let Latency = 6; + let NumMicroOps = 2; +} +// m,r,i. +def : InstRW<[ZnWriteBTRSCm], (instregex "BT(R|S|C)(16|32|64)m(r|i8)")>; + +// BLSI BLSMSK BLSR. +// r,r. +def : InstRW<[ZnWriteALULat2], (instregex "BLS(I|MSK|R)(32|64)rr")>; +// r,m. +def : InstRW<[ZnWriteALULat2Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>; + +// CLD STD. +def : InstRW<[WriteALU], (instrs STD, CLD)>; + +// PDEP PEXT. +// r,r,r. +def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>; +// r,r,m. +def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>; + +// RCR RCL. +// m,i. +def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m(1|i|CL)")>; + +// SHR SHL SAR. +// m,i. +def : InstRW<[WriteShiftLd], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>; + +// SHRD SHLD. +// m,r +def : InstRW<[WriteShiftLd], (instregex "SH(R|L)D(16|32|64)mri8")>; + +// r,r,cl. +def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)rrCL")>; + +// m,r,cl. +def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)mrCL")>; + +//-- Misc instructions --// +// CMPXCHG. +def ZnWriteCMPXCHG : SchedWriteRes<[ZnAGU, ZnALU]> { + let Latency = 8; + let NumMicroOps = 5; +} +def : InstRW<[ZnWriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>; + +// CMPXCHG8B. 
+def ZnWriteCMPXCHG8B : SchedWriteRes<[ZnAGU, ZnALU]> { + let NumMicroOps = 18; +} +def : InstRW<[ZnWriteCMPXCHG8B], (instrs CMPXCHG8B)>; + +def : InstRW<[WriteMicrocoded], (instrs CMPXCHG16B)>; + +// LEAVE +def ZnWriteLEAVE : SchedWriteRes<[ZnALU, ZnAGU]> { + let Latency = 8; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteLEAVE], (instregex "LEAVE")>; + +// PAUSE. +def : InstRW<[WriteMicrocoded], (instrs PAUSE)>; + +// RDTSC. +def : InstRW<[WriteMicrocoded], (instregex "RDTSC")>; + +// RDPMC. +def : InstRW<[WriteMicrocoded], (instrs RDPMC)>; + +// RDRAND. +def : InstRW<[WriteMicrocoded], (instregex "RDRAND(16|32|64)r")>; + +// XGETBV. +def : InstRW<[WriteMicrocoded], (instregex "XGETBV")>; + +//-- String instructions --// +// CMPS. +def : InstRW<[WriteMicrocoded], (instregex "CMPS(B|L|Q|W)")>; + +// LODSB/W. +def : InstRW<[WriteMicrocoded], (instregex "LODS(B|W)")>; + +// LODSD/Q. +def : InstRW<[WriteMicrocoded], (instregex "LODS(L|Q)")>; + +// MOVS. +def : InstRW<[WriteMicrocoded], (instregex "MOVS(B|L|Q|W)")>; + +// SCAS. +def : InstRW<[WriteMicrocoded], (instregex "SCAS(B|W|L|Q)")>; + +// STOS +def : InstRW<[WriteMicrocoded], (instregex "STOS(B|L|Q|W)")>; + +// XADD. +def : InstRW<[WriteMicrocoded], (instregex "XADD(8|16|32|64)rm")>; + +//=== Floating Point x87 Instructions ===// +//-- Move instructions --// + +def ZnWriteFLDr : SchedWriteRes<[ZnFPU13]> ; + +def ZnWriteSTr: SchedWriteRes<[ZnFPU23]> { + let Latency = 5; + let NumMicroOps = 2; +} + +// LD_F. +// r. +def : InstRW<[ZnWriteFLDr], (instregex "LD_Frr")>; + +// m. +def ZnWriteLD_F80m : SchedWriteRes<[ZnAGU, ZnFPU13]> { + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteLD_F80m], (instregex "LD_F80m")>; + +// FBLD. +def : InstRW<[WriteMicrocoded], (instregex "FBLDm")>; + +// FST(P). +// r. +def : InstRW<[ZnWriteSTr], (instregex "ST_(F|FP)rr")>; + +// m80. +def ZnWriteST_FP80m : SchedWriteRes<[ZnAGU, ZnFPU23]> { + let Latency = 5; +} +def : InstRW<[ZnWriteST_FP80m], (instregex "ST_FP80m")>; + +// FBSTP. +// m80. +def : InstRW<[WriteMicrocoded], (instregex "FBSTPm")>; + +def ZnWriteFXCH : SchedWriteRes<[ZnFPU]>; + +// FXCHG. +def : InstRW<[ZnWriteFXCH], (instrs XCH_F)>; + +// FILD. +def ZnWriteFILD : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 11; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteFILD], (instregex "ILD_F(16|32|64)m")>; + +// FIST(P) FISTTP. +def ZnWriteFIST : SchedWriteRes<[ZnAGU, ZnFPU23]> { + let Latency = 12; +} +def : InstRW<[ZnWriteFIST], (instregex "IS(T|TT)_(F|FP)(16|32|64)m")>; + +def ZnWriteFPU13 : SchedWriteRes<[ZnAGU, ZnFPU13]> { + let Latency = 8; +} + +def ZnWriteFPU3 : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 11; +} + +// FLDZ. +def : SchedAlias<WriteFLD0, ZnWriteFPU13>; + +// FLD1. +def : SchedAlias<WriteFLD1, ZnWriteFPU3>; + +// FLDPI FLDL2E etc. +def : SchedAlias<WriteFLDC, ZnWriteFPU3>; + +// FNSTSW. +// AX. +def : InstRW<[WriteMicrocoded], (instrs FNSTSW16r)>; + +// m16. +def : InstRW<[WriteMicrocoded], (instrs FNSTSWm)>; + +// FLDCW. +def : InstRW<[WriteMicrocoded], (instrs FLDCW16m)>; + +// FNSTCW. +def : InstRW<[WriteMicrocoded], (instrs FNSTCW16m)>; + +// FINCSTP FDECSTP. +def : InstRW<[ZnWriteFPU3], (instrs FINCSTP, FDECSTP)>; + +// FFREE. +def : InstRW<[ZnWriteFPU3], (instregex "FFREE")>; + +// FNSAVE. +def : InstRW<[WriteMicrocoded], (instregex "FSAVEm")>; + +// FRSTOR. 
+def : InstRW<[WriteMicrocoded], (instregex "FRSTORm")>; + +//-- Arithmetic instructions --// + +def ZnWriteFPU3Lat1 : SchedWriteRes<[ZnFPU3]> ; + +def ZnWriteFPU0Lat1 : SchedWriteRes<[ZnFPU0]> ; + +def ZnWriteFPU0Lat1Ld : SchedWriteRes<[ZnAGU, ZnFPU0]> { + let Latency = 8; +} + +// FCHS. +def : InstRW<[ZnWriteFPU3Lat1], (instregex "CHS_F")>; + +// FCOM(P) FUCOM(P). +// r. +def : InstRW<[ZnWriteFPU0Lat1], (instregex "COM(P?)_FST0r", "UCOM_F(P?)r")>; +// m. +def : InstRW<[ZnWriteFPU0Lat1Ld], (instregex "FCOM(P?)(32|64)m")>; + +// FCOMPP FUCOMPP. +// r. +def : InstRW<[ZnWriteFPU0Lat1], (instrs FCOMPP, UCOM_FPPr)>; + +def ZnWriteFPU02 : SchedWriteRes<[ZnAGU, ZnFPU02]> +{ + let Latency = 9; +} + +// FCOMI(P) FUCOMI(P). +// m. +def : InstRW<[ZnWriteFPU02], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>; + +def ZnWriteFPU03 : SchedWriteRes<[ZnAGU, ZnFPU03]> +{ + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,3]; +} + +// FICOM(P). +def : InstRW<[ZnWriteFPU03], (instregex "FICOM(P?)(16|32)m")>; + +// FTST. +def : InstRW<[ZnWriteFPU0Lat1], (instregex "TST_F")>; + +// FXAM. +def : InstRW<[ZnWriteFPU3Lat1], (instrs FXAM)>; + +// FPREM. +def : InstRW<[WriteMicrocoded], (instrs FPREM)>; + +// FPREM1. +def : InstRW<[WriteMicrocoded], (instrs FPREM1)>; + +// FRNDINT. +def : InstRW<[WriteMicrocoded], (instrs FRNDINT)>; + +// FSCALE. +def : InstRW<[WriteMicrocoded], (instrs FSCALE)>; + +// FXTRACT. +def : InstRW<[WriteMicrocoded], (instrs FXTRACT)>; + +// FNOP. +def : InstRW<[ZnWriteFPU0Lat1], (instrs FNOP)>; + +// WAIT. +def : InstRW<[ZnWriteFPU0Lat1], (instrs WAIT)>; + +// FNCLEX. +def : InstRW<[WriteMicrocoded], (instrs FNCLEX)>; + +// FNINIT. +def : InstRW<[WriteMicrocoded], (instrs FNINIT)>; + +//=== Integer MMX and XMM Instructions ===// + +// PACKSSWB/DW. +// mm <- mm. +def ZnWriteFPU12 : SchedWriteRes<[ZnFPU12]> ; +def ZnWriteFPU12Y : SchedWriteRes<[ZnFPU12]> { + let NumMicroOps = 2; +} +def ZnWriteFPU12m : SchedWriteRes<[ZnAGU, ZnFPU12]> ; +def ZnWriteFPU12Ym : SchedWriteRes<[ZnAGU, ZnFPU12]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def : InstRW<[ZnWriteFPU12], (instrs MMX_PACKSSDWirr, + MMX_PACKSSWBirr, + MMX_PACKUSWBirr)>; +def : InstRW<[ZnWriteFPU12m], (instrs MMX_PACKSSDWirm, + MMX_PACKSSWBirm, + MMX_PACKUSWBirm)>; + +// VPMOVSX/ZX BW BD BQ WD WQ DQ. +// y <- x. +def : InstRW<[ZnWriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrr")>; +def : InstRW<[ZnWriteFPU12Ym], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrm")>; + +def ZnWriteFPU013 : SchedWriteRes<[ZnFPU013]> ; +def ZnWriteFPU013Y : SchedWriteRes<[ZnFPU013]> { + let Latency = 2; +} +def ZnWriteFPU013m : SchedWriteRes<[ZnAGU, ZnFPU013]> { + let Latency = 8; + let NumMicroOps = 2; +} +def ZnWriteFPU013Ld : SchedWriteRes<[ZnAGU, ZnFPU013]> { + let Latency = 8; + let NumMicroOps = 2; +} +def ZnWriteFPU013LdY : SchedWriteRes<[ZnAGU, ZnFPU013]> { + let Latency = 9; + let NumMicroOps = 2; +} + +// PBLENDW. +// x,x,i / v,v,v,i +def : InstRW<[ZnWriteFPU013], (instregex "(V?)PBLENDWrri")>; +// ymm +def : InstRW<[ZnWriteFPU013Y], (instrs VPBLENDWYrri)>; + +// x,m,i / v,v,m,i +def : InstRW<[ZnWriteFPU013Ld], (instregex "(V?)PBLENDWrmi")>; +// y,m,i +def : InstRW<[ZnWriteFPU013LdY], (instrs VPBLENDWYrmi)>; + +def ZnWriteFPU01 : SchedWriteRes<[ZnFPU01]> ; +def ZnWriteFPU01Y : SchedWriteRes<[ZnFPU01]> { + let NumMicroOps = 2; +} + +// VPBLENDD. +// v,v,v,i. 
+def : InstRW<[ZnWriteFPU01], (instrs VPBLENDDrri)>; +// ymm +def : InstRW<[ZnWriteFPU01Y], (instrs VPBLENDDYrri)>; + +// v,v,m,i +def ZnWriteFPU01Op2 : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let NumMicroOps = 2; + let Latency = 8; + let ResourceCycles = [1, 2]; +} +def ZnWriteFPU01Op2Y : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let NumMicroOps = 2; + let Latency = 9; + let ResourceCycles = [1, 3]; +} +def : InstRW<[ZnWriteFPU01Op2], (instrs VPBLENDDrmi)>; +def : InstRW<[ZnWriteFPU01Op2Y], (instrs VPBLENDDYrmi)>; + +// MASKMOVQ. +def : InstRW<[WriteMicrocoded], (instregex "MMX_MASKMOVQ(64)?")>; + +// MASKMOVDQU. +def : InstRW<[WriteMicrocoded], (instregex "(V?)MASKMOVDQU(64)?")>; + +// VPMASKMOVD. +// ymm +def : InstRW<[WriteMicrocoded], + (instregex "VPMASKMOVD(Y?)rm")>; +// m, v,v. +def : InstRW<[WriteMicrocoded], (instregex "VPMASKMOV(D|Q)(Y?)mr")>; + +// VPBROADCAST B/W. +// x, m8/16. +def ZnWriteVPBROADCAST128Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWriteVPBROADCAST128Ld], + (instregex "VPBROADCAST(B|W)rm")>; + +// y, m8/16 +def ZnWriteVPBROADCAST256Ld : SchedWriteRes<[ZnAGU, ZnFPU1]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWriteVPBROADCAST256Ld], + (instregex "VPBROADCAST(B|W)Yrm")>; + +// VPGATHER. +def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>; + +//-- Arithmetic instructions --// + +// HADD, HSUB PS/PD +// PHADD|PHSUB (S) W/D. +def : SchedAlias<WritePHAdd, ZnWriteMicrocoded>; +def : SchedAlias<WritePHAddLd, ZnWriteMicrocoded>; +def : SchedAlias<WritePHAddX, ZnWriteMicrocoded>; +def : SchedAlias<WritePHAddXLd, ZnWriteMicrocoded>; +def : SchedAlias<WritePHAddY, ZnWriteMicrocoded>; +def : SchedAlias<WritePHAddYLd, ZnWriteMicrocoded>; + +// PCMPGTQ. +def ZnWritePCMPGTQr : SchedWriteRes<[ZnFPU03]>; +def : InstRW<[ZnWritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>; + +// x <- x,m. +def ZnWritePCMPGTQm : SchedWriteRes<[ZnAGU, ZnFPU03]> { + let Latency = 8; +} +// ymm. +def ZnWritePCMPGTQYm : SchedWriteRes<[ZnAGU, ZnFPU03]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,2]; +} +def : InstRW<[ZnWritePCMPGTQm], (instregex "(V?)PCMPGTQrm")>; +def : InstRW<[ZnWritePCMPGTQYm], (instrs VPCMPGTQYrm)>; + +//-- Logic instructions --// + +// PSLL,PSRL,PSRA W/D/Q. +// x,x / v,v,x. +def ZnWritePShift : SchedWriteRes<[ZnFPU2]> ; +def ZnWritePShiftY : SchedWriteRes<[ZnFPU2]> { + let Latency = 2; +} + +// PSLL,PSRL DQ. +def : InstRW<[ZnWritePShift], (instregex "(V?)PS(R|L)LDQri")>; +def : InstRW<[ZnWritePShiftY], (instregex "(V?)PS(R|L)LDQYri")>; + +//=== Floating Point XMM and YMM Instructions ===// +//-- Move instructions --// + +// VPERM2F128. +def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rr)>; +def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rm)>; + +def ZnWriteBROADCAST : SchedWriteRes<[ZnAGU, ZnFPU13]> { + let NumMicroOps = 2; + let Latency = 8; +} +// VBROADCASTF128. +def : InstRW<[ZnWriteBROADCAST], (instrs VBROADCASTF128)>; + +// EXTRACTPS. +// r32,x,i. +def ZnWriteEXTRACTPSr : SchedWriteRes<[ZnFPU12, ZnFPU2]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>; + +def ZnWriteEXTRACTPSm : SchedWriteRes<[ZnAGU,ZnFPU12, ZnFPU2]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [5, 1, 2]; +} +// m32,x,i. +def : InstRW<[ZnWriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>; + +// VEXTRACTF128. 
+// x,y,i. +def : InstRW<[ZnWriteFPU013], (instrs VEXTRACTF128rr)>; + +// m128,y,i. +def : InstRW<[ZnWriteFPU013m], (instrs VEXTRACTF128mr)>; + +def ZnWriteVINSERT128r: SchedWriteRes<[ZnFPU013]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def ZnWriteVINSERT128Ld: SchedWriteRes<[ZnAGU,ZnFPU013]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +// VINSERTF128. +// y,y,x,i. +def : InstRW<[ZnWriteVINSERT128r], (instrs VINSERTF128rr)>; +def : InstRW<[ZnWriteVINSERT128Ld], (instrs VINSERTF128rm)>; + +// VGATHER. +def : InstRW<[WriteMicrocoded], (instregex "VGATHER(Q|D)(PD|PS)(Y?)rm")>; + +//-- Conversion instructions --// +def ZnWriteCVTPD2PSr: SchedWriteRes<[ZnFPU3]> { + let Latency = 4; +} +def ZnWriteCVTPD2PSYr: SchedWriteRes<[ZnFPU3]> { + let Latency = 5; +} + +// CVTPD2PS. +// x,x. +def : SchedAlias<WriteCvtPD2PS, ZnWriteCVTPD2PSr>; +// y,y. +def : SchedAlias<WriteCvtPD2PSY, ZnWriteCVTPD2PSYr>; +// z,z. +defm : X86WriteResUnsupported<WriteCvtPD2PSZ>; + +def ZnWriteCVTPD2PSLd: SchedWriteRes<[ZnAGU,ZnFPU03]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,2]; +} +// x,m128. +def : SchedAlias<WriteCvtPD2PSLd, ZnWriteCVTPD2PSLd>; + +// x,m256. +def ZnWriteCVTPD2PSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 11; +} +def : SchedAlias<WriteCvtPD2PSYLd, ZnWriteCVTPD2PSYLd>; +// z,m512 +defm : X86WriteResUnsupported<WriteCvtPD2PSZLd>; + +// CVTSD2SS. +// x,x. +// Same as WriteCVTPD2PSr +def : SchedAlias<WriteCvtSD2SS, ZnWriteCVTPD2PSr>; + +// x,m64. +def : SchedAlias<WriteCvtSD2SSLd, ZnWriteCVTPD2PSLd>; + +// CVTPS2PD. +// x,x. +def ZnWriteCVTPS2PDr : SchedWriteRes<[ZnFPU3]> { + let Latency = 3; +} +def : SchedAlias<WriteCvtPS2PD, ZnWriteCVTPS2PDr>; + +// x,m64. +// y,m128. +def ZnWriteCVTPS2PDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 10; + let NumMicroOps = 2; +} +def : SchedAlias<WriteCvtPS2PDLd, ZnWriteCVTPS2PDLd>; +def : SchedAlias<WriteCvtPS2PDYLd, ZnWriteCVTPS2PDLd>; +defm : X86WriteResUnsupported<WriteCvtPS2PDZLd>; + +// y,x. +def ZnWriteVCVTPS2PDY : SchedWriteRes<[ZnFPU3]> { + let Latency = 3; +} +def : SchedAlias<WriteCvtPS2PDY, ZnWriteVCVTPS2PDY>; +defm : X86WriteResUnsupported<WriteCvtPS2PDZ>; + +// CVTSS2SD. +// x,x. +def ZnWriteCVTSS2SDr : SchedWriteRes<[ZnFPU3]> { + let Latency = 4; +} +def : SchedAlias<WriteCvtSS2SD, ZnWriteCVTSS2SDr>; + +// x,m32. +def ZnWriteCVTSS2SDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : SchedAlias<WriteCvtSS2SDLd, ZnWriteCVTSS2SDLd>; + +def ZnWriteCVTDQ2PDr: SchedWriteRes<[ZnFPU12,ZnFPU3]> { + let Latency = 5; +} +// CVTDQ2PD. +// x,x. +def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "(V)?CVTDQ2PDrr")>; + +// Same as xmm +// y,x. +def : InstRW<[ZnWriteCVTDQ2PDr], (instrs VCVTDQ2PDYrr)>; + +def ZnWriteCVTPD2DQr: SchedWriteRes<[ZnFPU12, ZnFPU3]> { + let Latency = 5; +} +// CVT(T)PD2DQ. +// x,x. +def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "(V?)CVT(T?)PD2DQrr")>; + +def ZnWriteCVTPD2DQLd: SchedWriteRes<[ZnAGU,ZnFPU12,ZnFPU3]> { + let Latency = 12; + let NumMicroOps = 2; +} +// x,m128. +def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(V?)CVT(T?)PD2DQrm")>; +// same as xmm handling +// x,y. +def : InstRW<[ZnWriteCVTPD2DQr], (instregex "VCVT(T?)PD2DQYrr")>; +// x,m256. +def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQYrm")>; + +def ZnWriteCVTPS2PIr: SchedWriteRes<[ZnFPU3]> { + let Latency = 4; +} +// CVT(T)PS2PI. +// mm,x. 
+def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>; + +// CVTPI2PD. +// x,mm. +def : InstRW<[ZnWriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>; + +// CVT(T)PD2PI. +// mm,x. +def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>; + +def ZnWriteCVSTSI2SSr: SchedWriteRes<[ZnFPU3]> { + let Latency = 5; +} + +// same as CVTPD2DQr +// CVT(T)SS2SI. +// r32,x. +def : InstRW<[ZnWriteCVTPD2DQr], (instregex "(V?)CVT(T?)SS2SI(64)?rr")>; +// same as CVTPD2DQm +// r32,m32. +def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(V?)CVT(T?)SS2SI(64)?rm")>; + +def ZnWriteCVSTSI2SDr: SchedWriteRes<[ZnFPU013, ZnFPU3]> { + let Latency = 5; +} +// CVTSI2SD. +// x,r32/64. +def : InstRW<[ZnWriteCVSTSI2SDr], (instregex "(V?)CVTSI(64)?2SDrr")>; + + +def ZnWriteCVSTSI2SIr: SchedWriteRes<[ZnFPU3, ZnFPU2]> { + let Latency = 5; +} +def ZnWriteCVSTSI2SILd: SchedWriteRes<[ZnAGU, ZnFPU3, ZnFPU2]> { + let Latency = 12; +} +// CVTSD2SI. +// r32/64 +def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(V?)CVT(T?)SD2SI(64)?rr")>; +// r32,m32. +def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(V?)CVT(T?)SD2SI(64)?rm")>; + +// VCVTPS2PH. +// x,v,i. +def : SchedAlias<WriteCvtPS2PH, ZnWriteMicrocoded>; +def : SchedAlias<WriteCvtPS2PHY, ZnWriteMicrocoded>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZ>; +// m,v,i. +def : SchedAlias<WriteCvtPS2PHSt, ZnWriteMicrocoded>; +def : SchedAlias<WriteCvtPS2PHYSt, ZnWriteMicrocoded>; +defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>; + +// VCVTPH2PS. +// v,x. +def : SchedAlias<WriteCvtPH2PS, ZnWriteMicrocoded>; +def : SchedAlias<WriteCvtPH2PSY, ZnWriteMicrocoded>; +defm : X86WriteResUnsupported<WriteCvtPH2PSZ>; +// v,m. +def : SchedAlias<WriteCvtPH2PSLd, ZnWriteMicrocoded>; +def : SchedAlias<WriteCvtPH2PSYLd, ZnWriteMicrocoded>; +defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>; + +//-- SSE4A instructions --// +// EXTRQ +def ZnWriteEXTRQ: SchedWriteRes<[ZnFPU12, ZnFPU2]> { + let Latency = 2; +} +def : InstRW<[ZnWriteEXTRQ], (instregex "EXTRQ")>; + +// INSERTQ +def ZnWriteINSERTQ: SchedWriteRes<[ZnFPU03,ZnFPU1]> { + let Latency = 4; +} +def : InstRW<[ZnWriteINSERTQ], (instregex "INSERTQ")>; + +//-- SHA instructions --// +// SHA256MSG2 +def : InstRW<[WriteMicrocoded], (instregex "SHA256MSG2(Y?)r(r|m)")>; + +// SHA1MSG1, SHA256MSG1 +// x,x. +def ZnWriteSHA1MSG1r : SchedWriteRes<[ZnFPU12]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : InstRW<[ZnWriteSHA1MSG1r], (instregex "SHA(1|256)MSG1rr")>; +// x,m. +def ZnWriteSHA1MSG1Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> { + let Latency = 9; + let ResourceCycles = [1,2]; +} +def : InstRW<[ZnWriteSHA1MSG1Ld], (instregex "SHA(1|256)MSG1rm")>; + +// SHA1MSG2 +// x,x. +def ZnWriteSHA1MSG2r : SchedWriteRes<[ZnFPU12]> ; +def : InstRW<[ZnWriteSHA1MSG2r], (instregex "SHA1MSG2rr")>; +// x,m. +def ZnWriteSHA1MSG2Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> { + let Latency = 8; +} +def : InstRW<[ZnWriteSHA1MSG2Ld], (instregex "SHA1MSG2rm")>; + +// SHA1NEXTE +// x,x. +def ZnWriteSHA1NEXTEr : SchedWriteRes<[ZnFPU1]> ; +def : InstRW<[ZnWriteSHA1NEXTEr], (instregex "SHA1NEXTErr")>; +// x,m. +def ZnWriteSHA1NEXTELd : SchedWriteRes<[ZnAGU, ZnFPU1]> { + let Latency = 8; +} +def : InstRW<[ZnWriteSHA1NEXTELd], (instregex "SHA1NEXTErm")>; + +// SHA1RNDS4 +// x,x. +def ZnWriteSHA1RNDS4r : SchedWriteRes<[ZnFPU1]> { + let Latency = 6; +} +def : InstRW<[ZnWriteSHA1RNDS4r], (instregex "SHA1RNDS4rr")>; +// x,m. 
+def ZnWriteSHA1RNDS4Ld : SchedWriteRes<[ZnAGU, ZnFPU1]> { + let Latency = 13; +} +def : InstRW<[ZnWriteSHA1RNDS4Ld], (instregex "SHA1RNDS4rm")>; + +// SHA256RNDS2 +// x,x. +def ZnWriteSHA256RNDS2r : SchedWriteRes<[ZnFPU1]> { + let Latency = 4; +} +def : InstRW<[ZnWriteSHA256RNDS2r], (instregex "SHA256RNDS2rr")>; +// x,m. +def ZnWriteSHA256RNDS2Ld : SchedWriteRes<[ZnAGU, ZnFPU1]> { + let Latency = 11; +} +def : InstRW<[ZnWriteSHA256RNDS2Ld], (instregex "SHA256RNDS2rm")>; + +//-- Arithmetic instructions --// + +// HADD, HSUB PS/PD +def : SchedAlias<WriteFHAdd, ZnWriteMicrocoded>; +def : SchedAlias<WriteFHAddLd, ZnWriteMicrocoded>; +def : SchedAlias<WriteFHAddY, ZnWriteMicrocoded>; +def : SchedAlias<WriteFHAddYLd, ZnWriteMicrocoded>; + +// VDIVPS. +// TODO - convert to ZnWriteResFpuPair +// y,y,y. +def ZnWriteVDIVPSYr : SchedWriteRes<[ZnFPU3]> { + let Latency = 12; + let ResourceCycles = [12]; +} +def : SchedAlias<WriteFDivY, ZnWriteVDIVPSYr>; + +// y,y,m256. +def ZnWriteVDIVPSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 19; + let NumMicroOps = 2; + let ResourceCycles = [1, 19]; +} +def : SchedAlias<WriteFDivYLd, ZnWriteVDIVPSYLd>; + +// VDIVPD. +// TODO - convert to ZnWriteResFpuPair +// y,y,y. +def ZnWriteVDIVPDY : SchedWriteRes<[ZnFPU3]> { + let Latency = 15; + let ResourceCycles = [15]; +} +def : SchedAlias<WriteFDiv64Y, ZnWriteVDIVPDY>; + +// y,y,m256. +def ZnWriteVDIVPDYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 22; + let NumMicroOps = 2; + let ResourceCycles = [1,22]; +} +def : SchedAlias<WriteFDiv64YLd, ZnWriteVDIVPDYLd>; + +// DPPS. +// x,x,i / v,v,v,i. +def : SchedAlias<WriteDPPS, ZnWriteMicrocoded>; +def : SchedAlias<WriteDPPSY, ZnWriteMicrocoded>; + +// x,m,i / v,v,m,i. +def : SchedAlias<WriteDPPSLd, ZnWriteMicrocoded>; +def : SchedAlias<WriteDPPSYLd,ZnWriteMicrocoded>; + +// DPPD. +// x,x,i. +def : SchedAlias<WriteDPPD, ZnWriteMicrocoded>; + +// x,m,i. +def : SchedAlias<WriteDPPDLd, ZnWriteMicrocoded>; + +// RSQRTSS +// TODO - convert to ZnWriteResFpuPair +// x,x. +def ZnWriteRSQRTSSr : SchedWriteRes<[ZnFPU02]> { + let Latency = 5; +} +def : SchedAlias<WriteFRsqrt, ZnWriteRSQRTSSr>; + +// x,m128. +def ZnWriteRSQRTSSLd: SchedWriteRes<[ZnAGU, ZnFPU02]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,2]; // FIXME: Is this right? +} +def : SchedAlias<WriteFRsqrtLd, ZnWriteRSQRTSSLd>; + +// RSQRTPS +// TODO - convert to ZnWriteResFpuPair +// y,y. +def ZnWriteRSQRTPSYr : SchedWriteRes<[ZnFPU01]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : SchedAlias<WriteFRsqrtY, ZnWriteRSQRTPSYr>; + +// y,m256. +def ZnWriteRSQRTPSYLd : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let Latency = 12; + let NumMicroOps = 2; +} +def : SchedAlias<WriteFRsqrtYLd, ZnWriteRSQRTPSYLd>; + +//-- Other instructions --// + +// VZEROUPPER. +def : InstRW<[WriteMicrocoded], (instrs VZEROUPPER)>; + +// VZEROALL. +def : InstRW<[WriteMicrocoded], (instrs VZEROALL)>; + +} // SchedModel diff --git a/capstone/suite/synctools/tablegen/X86/back/X86_reduce.td b/capstone/suite/synctools/tablegen/X86/back/X86_reduce.td new file mode 100644 index 000000000..cf2ce6890 --- /dev/null +++ b/capstone/suite/synctools/tablegen/X86/back/X86_reduce.td @@ -0,0 +1,459 @@ +//===-- X86.td - Target definition file for the Intel X86 --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This is a target description file for the Intel i386 architecture, referred +// to here as the "X86" architecture. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing... +// +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// X86 Subtarget state +// + +def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true", + "64-bit mode (x86_64)">; +def Mode32Bit : SubtargetFeature<"32bit-mode", "In32BitMode", "true", + "32-bit mode (80386)">; +def Mode16Bit : SubtargetFeature<"16bit-mode", "In16BitMode", "true", + "16-bit mode (i8086)">; + +//===----------------------------------------------------------------------===// +// X86 Subtarget features +//===----------------------------------------------------------------------===// + +def FeatureX87 : SubtargetFeature<"x87","HasX87", "true", + "Enable X87 float instructions">; + +def FeatureNOPL : SubtargetFeature<"nopl", "HasNOPL", "true", + "Enable NOPL instruction">; + +def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", + "Enable conditional move instructions">; + +def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", + "Support POPCNT instruction">; + +def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true", + "Support fxsave/fxrestore instructions">; + +def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true", + "Support xsave instructions">; + +def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true", + "Support xsaveopt instructions">; + +def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true", + "Support xsavec instructions">; + +def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true", + "Support xsaves instructions">; + +def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", + "Enable SSE instructions", + // SSE codegen depends on cmovs, and all + // SSE1+ processors support them. + [FeatureCMOV]>; +def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", + "Enable SSE2 instructions", + [FeatureSSE1]>; +def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3", + "Enable SSE3 instructions", + [FeatureSSE2]>; +def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3", + "Enable SSSE3 instructions", + [FeatureSSE3]>; +def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41", + "Enable SSE 4.1 instructions", + [FeatureSSSE3]>; +def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42", + "Enable SSE 4.2 instructions", + [FeatureSSE41]>; +// The MMX subtarget feature is separate from the rest of the SSE features +// because it's important (for odd compatibility reasons) to be able to +// turn it off explicitly while allowing SSE+ to be on. +def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX", + "Enable MMX instructions">; +def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", + "Enable 3DNow! instructions", + [FeatureMMX]>; +def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA", + "Enable 3DNow! Athlon instructions", + [Feature3DNow]>; +// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied +// feature, because SSE2 can be disabled (e.g. for compiling OS kernels) +// without disabling 64-bit mode. 
+def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", + "Support 64-bit instructions", + [FeatureCMOV]>; +def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true", + "64-bit with cmpxchg16b", + [Feature64Bit]>; +def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", + "SHLD instruction is slow">; +def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true", + "PMULLD instruction is slow">; +// FIXME: This should not apply to CPUs that do not have SSE. +def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16", + "IsUAMem16Slow", "true", + "Slow unaligned 16-byte memory access">; +def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", + "IsUAMem32Slow", "true", + "Slow unaligned 32-byte memory access">; +def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", + "Support SSE 4a instructions", + [FeatureSSE3]>; + +def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX", + "Enable AVX instructions", + [FeatureSSE42]>; +def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2", + "Enable AVX2 instructions", + [FeatureAVX]>; +def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true", + "Enable three-operand fused multiple-add", + [FeatureAVX]>; +def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", + "Support 16-bit floating point conversion instructions", + [FeatureAVX]>; +def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F", + "Enable AVX-512 instructions", + [FeatureAVX2, FeatureFMA, FeatureF16C]>; +def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", + "Enable AVX-512 Exponential and Reciprocal Instructions", + [FeatureAVX512]>; +def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", + "Enable AVX-512 Conflict Detection Instructions", + [FeatureAVX512]>; +def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ", + "true", "Enable AVX-512 Population Count Instructions", + [FeatureAVX512]>; +def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", + "Enable AVX-512 PreFetch Instructions", + [FeatureAVX512]>; +def FeaturePREFETCHWT1 : SubtargetFeature<"prefetchwt1", "HasPREFETCHWT1", + "true", + "Prefetch with Intent to Write and T1 Hint">; +def FeatureDQI : SubtargetFeature<"avx512dq", "HasDQI", "true", + "Enable AVX-512 Doubleword and Quadword Instructions", + [FeatureAVX512]>; +def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true", + "Enable AVX-512 Byte and Word Instructions", + [FeatureAVX512]>; +def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true", + "Enable AVX-512 Vector Length eXtensions", + [FeatureAVX512]>; +def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true", + "Enable AVX-512 Vector Byte Manipulation Instructions", + [FeatureBWI]>; +def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true", + "Enable AVX-512 further Vector Byte Manipulation Instructions", + [FeatureBWI]>; +def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true", + "Enable AVX-512 Integer Fused Multiple-Add", + [FeatureAVX512]>; +def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", + "Enable protection keys">; +def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true", + "Enable AVX-512 Vector Neural Network Instructions", + [FeatureAVX512]>; +def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true", + "Enable AVX-512 Bit Algorithms", + [FeatureBWI]>; +def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", + "Enable packed carry-less 
multiplication instructions", + [FeatureSSE2]>; +def FeatureGFNI : SubtargetFeature<"gfni", "HasGFNI", "true", + "Enable Galois Field Arithmetic Instructions", + [FeatureSSE2]>; +def FeatureVPCLMULQDQ : SubtargetFeature<"vpclmulqdq", "HasVPCLMULQDQ", "true", + "Enable vpclmulqdq instructions", + [FeatureAVX, FeaturePCLMUL]>; +def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", + "Enable four-operand fused multiple-add", + [FeatureAVX, FeatureSSE4A]>; +def FeatureXOP : SubtargetFeature<"xop", "HasXOP", "true", + "Enable XOP instructions", + [FeatureFMA4]>; +def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem", + "HasSSEUnalignedMem", "true", + "Allow unaligned memory operands with SSE instructions">; +def FeatureAES : SubtargetFeature<"aes", "HasAES", "true", + "Enable AES instructions", + [FeatureSSE2]>; +def FeatureVAES : SubtargetFeature<"vaes", "HasVAES", "true", + "Promote selected AES instructions to AVX512/AVX registers", + [FeatureAVX, FeatureAES]>; +def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true", + "Enable TBM instructions">; +def FeatureLWP : SubtargetFeature<"lwp", "HasLWP", "true", + "Enable LWP instructions">; +def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true", + "Support MOVBE instruction">; +def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true", + "Support RDRAND instruction">; +def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true", + "Support FS/GS Base instructions">; +def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true", + "Support LZCNT instruction">; +def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true", + "Support BMI instructions">; +def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true", + "Support BMI2 instructions">; +def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true", + "Support RTM instructions">; +def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", + "Support ADX instructions">; +def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true", + "Enable SHA instructions", + [FeatureSSE2]>; +def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true", + "Support CET Shadow-Stack instructions">; +def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", + "Support PRFCHW instructions">; +def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", + "Support RDSEED instruction">; +def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true", + "Support LAHF and SAHF instructions">; +def FeatureMWAITX : SubtargetFeature<"mwaitx", "HasMWAITX", "true", + "Enable MONITORX/MWAITX timer functionality">; +def FeatureCLZERO : SubtargetFeature<"clzero", "HasCLZERO", "true", + "Enable Cache Line Zero">; +def FeatureCLDEMOTE : SubtargetFeature<"cldemote", "HasCLDEMOTE", "true", + "Enable Cache Demote">; +def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true", + "Support ptwrite instruction">; +def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", + "Support MPX instructions">; +def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", + "Use LEA for adjusting the stack pointer">; +def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", + "HasSlowDivide32", "true", + "Use 8-bit divide for positive values less than 256">; +def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl", + "HasSlowDivide64", "true", + "Use 32-bit divide for positive values less than 2^32">; +def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", + "PadShortFunctions", "true", + "Pad short functions">; +def 
FeatureINVPCID : SubtargetFeature<"invpcid", "HasINVPCID", "true", + "Invalidate Process-Context Identifier">; +def FeatureSGX : SubtargetFeature<"sgx", "HasSGX", "true", + "Enable Software Guard Extensions">; +def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true", + "Flush A Cache Line Optimized">; +def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true", + "Cache Line Write Back">; +def FeatureWBNOINVD : SubtargetFeature<"wbnoinvd", "HasWBNOINVD", "true", + "Write Back No Invalidate">; +def FeatureRDPID : SubtargetFeature<"rdpid", "HasRDPID", "true", + "Support RDPID instructions">; +def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true", + "Wait and pause enhancements">; +// On some processors, instructions that implicitly take two memory operands are +// slow. In practice, this means that CALL, PUSH, and POP with memory operands +// should be avoided in favor of a MOV + register CALL/PUSH/POP. +def FeatureSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops", + "SlowTwoMemOps", "true", + "Two memory operand instructions are slow">; +def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", + "LEA instruction needs inputs at AG stage">; +def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", + "LEA instruction with certain arguments is slow">; +def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true", + "LEA instruction with 3 ops or certain registers is slow">; +def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true", + "INC and DEC instructions are slower than ADD and SUB">; +def FeatureSoftFloat + : SubtargetFeature<"soft-float", "UseSoftFloat", "true", + "Use software floating point features.">; +def FeaturePOPCNTFalseDeps : SubtargetFeature<"false-deps-popcnt", + "HasPOPCNTFalseDeps", "true", + "POPCNT has a false dependency on dest register">; +def FeatureLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt", + "HasLZCNTFalseDeps", "true", + "LZCNT/TZCNT have a false dependency on dest register">; +def FeaturePCONFIG : SubtargetFeature<"pconfig", "HasPCONFIG", "true", + "platform configuration instruction">; +// On recent X86 (port bound) processors, its preferable to combine to a single shuffle +// using a variable mask over multiple fixed shuffles. +def FeatureFastVariableShuffle + : SubtargetFeature<"fast-variable-shuffle", + "HasFastVariableShuffle", + "true", "Shuffles with variable masks are fast">; +// On some X86 processors, there is no performance hazard to writing only the +// lower parts of a YMM or ZMM register without clearing the upper part. +def FeatureFastPartialYMMorZMMWrite + : SubtargetFeature<"fast-partial-ymm-or-zmm-write", + "HasFastPartialYMMorZMMWrite", + "true", "Partial writes to YMM/ZMM registers are fast">; +// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency +// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if +// vector FSQRT has higher throughput than the corresponding NR code. +// The idea is that throughput bound code is likely to be vectorized, so for +// vectorized code we should care about the throughput of SQRT operations. +// But if the code is scalar that probably means that the code has some kind of +// dependency and we should care more about reducing the latency. 
+def FeatureFastScalarFSQRT + : SubtargetFeature<"fast-scalar-fsqrt", "HasFastScalarFSQRT", + "true", "Scalar SQRT is fast (disable Newton-Raphson)">; +def FeatureFastVectorFSQRT + : SubtargetFeature<"fast-vector-fsqrt", "HasFastVectorFSQRT", + "true", "Vector SQRT is fast (disable Newton-Raphson)">; +// If lzcnt has equivalent latency/throughput to most simple integer ops, it can +// be used to replace test/set sequences. +def FeatureFastLZCNT + : SubtargetFeature< + "fast-lzcnt", "HasFastLZCNT", "true", + "LZCNT instructions are as fast as most simple integer ops">; +// If the target can efficiently decode NOPs upto 11-bytes in length. +def FeatureFast11ByteNOP + : SubtargetFeature< + "fast-11bytenop", "HasFast11ByteNOP", "true", + "Target can quickly decode up to 11 byte NOPs">; +// If the target can efficiently decode NOPs upto 15-bytes in length. +def FeatureFast15ByteNOP + : SubtargetFeature< + "fast-15bytenop", "HasFast15ByteNOP", "true", + "Target can quickly decode up to 15 byte NOPs">; +// Sandy Bridge and newer processors can use SHLD with the same source on both +// inputs to implement rotate to avoid the partial flag update of the normal +// rotate instructions. +def FeatureFastSHLDRotate + : SubtargetFeature< + "fast-shld-rotate", "HasFastSHLDRotate", "true", + "SHLD can be used as a faster rotate">; + +// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka +// "string operations"). See "REP String Enhancement" in the Intel Software +// Development Manual. This feature essentially means that REP MOVSB will copy +// using the largest available size instead of copying bytes one by one, making +// it at least as fast as REPMOVS{W,D,Q}. +def FeatureERMSB + : SubtargetFeature< + "ermsb", "HasERMSB", "true", + "REP MOVS/STOS are fast">; + +// Sandy Bridge and newer processors have many instructions that can be +// fused with conditional branches and pass through the CPU as a single +// operation. +def FeatureMacroFusion + : SubtargetFeature<"macrofusion", "HasMacroFusion", "true", + "Various instructions can be fused with conditional branches">; + +// Gather is available since Haswell (AVX2 set). So technically, we can +// generate Gathers on all AVX2 processors. But the overhead on HSW is high. +// Skylake Client processor has faster Gathers than HSW and performance is +// similar to Skylake Server (AVX-512). +def FeatureHasFastGather + : SubtargetFeature<"fast-gather", "HasFastGather", "true", + "Indicates if gather is reasonably fast.">; + +def FeaturePrefer256Bit + : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true", + "Prefer 256-bit AVX instructions">; + +// Enable mitigation of some aspects of speculative execution related +// vulnerabilities by removing speculatable indirect branches. This disables +// jump-table formation, rewrites explicit `indirectbr` instructions into +// `switch` instructions, and uses a special construct called a "retpoline" to +// prevent speculation of the remaining indirect branches (indirect calls and +// tail calls). +def FeatureRetpoline + : SubtargetFeature<"retpoline", "UseRetpoline", "true", + "Remove speculation of indirect branches from the " + "generated code, either by avoiding them entirely or " + "lowering them with a speculation blocking construct.">; + +// Rely on external thunks for the emitted retpoline calls. This allows users +// to provide their own custom thunk definitions in highly specialized +// environments such as a kernel that does boot-time hot patching. 
+def FeatureRetpolineExternalThunk + : SubtargetFeature< + "retpoline-external-thunk", "UseRetpolineExternalThunk", "true", + "Enable retpoline, but with an externally provided thunk.", + [FeatureRetpoline]>; + +// Direct Move instructions. +def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true", + "Support movdiri instruction">; +def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true", + "Support movdir64b instruction">; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "X86RegisterInfo.td" +include "X86RegisterBanks.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "X86Schedule.td" +include "X86InstrInfo_reduce.td" + +def X86InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// Assembly Parser +//===----------------------------------------------------------------------===// + +def ATTAsmParserVariant : AsmParserVariant { + int Variant = 0; + + // Variant name. + string Name = "att"; + + // Discard comments in assembly strings. + string CommentDelimiter = "#"; + + // Recognize hard coded registers. + string RegisterPrefix = "%"; +} + +def IntelAsmParserVariant : AsmParserVariant { + int Variant = 1; + + // Variant name. + string Name = "intel"; + + // Discard comments in assembly strings. + string CommentDelimiter = ";"; + + // Recognize hard coded registers. + string RegisterPrefix = ""; +} + +//===----------------------------------------------------------------------===// +// Assembly Printers +//===----------------------------------------------------------------------===// + +// The X86 target supports two different syntaxes for emitting machine code. +// This is controlled by the -x86-asm-syntax={att|intel} +def ATTAsmWriter : AsmWriter { + string AsmWriterClassName = "ATTInstPrinter"; + int Variant = 0; +} +def IntelAsmWriter : AsmWriter { + string AsmWriterClassName = "IntelInstPrinter"; + int Variant = 1; +} + +def X86 : Target { + // Information about the instructions... + let InstructionSet = X86InstrInfo; + let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant]; + let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter]; + let AllowRegisterRenaming = 1; +} |