aboutsummaryrefslogtreecommitdiffstats
path: root/capstone/suite/test_mc.py
diff options
context:
space:
mode:
authorAngelos Mouzakitis <a.mouzakitis@virtualopensystems.com>2023-10-10 14:33:42 +0000
committerAngelos Mouzakitis <a.mouzakitis@virtualopensystems.com>2023-10-10 14:33:42 +0000
commitaf1a266670d040d2f4083ff309d732d648afba2a (patch)
tree2fc46203448ddcc6f81546d379abfaeb323575e9 /capstone/suite/test_mc.py
parente02cda008591317b1625707ff8e115a4841aa889 (diff)
Add submodule dependency filesHEADmaster
Change-Id: Iaf8d18082d3991dec7c0ebbea540f092188eb4ec
Diffstat (limited to 'capstone/suite/test_mc.py')
-rwxr-xr-xcapstone/suite/test_mc.py267
1 files changed, 267 insertions, 0 deletions
diff --git a/capstone/suite/test_mc.py b/capstone/suite/test_mc.py
new file mode 100755
index 000000000..096e99fbe
--- /dev/null
+++ b/capstone/suite/test_mc.py
@@ -0,0 +1,267 @@
+#!/usr/bin/python
+# Test tool to compare Capstone output with llvm-mc. By Nguyen Anh Quynh, 2014
+import array, os.path, sys
+from subprocess import Popen, PIPE, STDOUT
+from capstone import *
+
+
+# convert all hex numbers to decimal numbers in a text
+def normalize_hex(a):
+ while(True):
+ i = a.find('0x')
+ if i == -1: # no more hex number
+ break
+ hexnum = '0x'
+ for c in a[i + 2:]:
+ if c in '0123456789abcdefABCDEF':
+ hexnum += c
+ else:
+ break
+ num = int(hexnum, 16)
+ a = a.replace(hexnum, str(num))
+ return a
+
+
+def run_mc(arch, hexcode, option, syntax=None):
+ def normalize(text):
+ # remove tabs
+ text = text.lower()
+ items = text.split()
+ text = ' '.join(items)
+ if arch == CS_ARCH_X86:
+ # remove comment after #
+ i = text.find('# ')
+ if i != -1:
+ return text[:i].strip()
+ if arch == CS_ARCH_ARM64:
+ # remove comment after #
+ i = text.find('// ')
+ if i != -1:
+ return text[:i].strip()
+ # remove some redundant spaces
+ text = text.replace('{ ', '{')
+ text = text.replace(' }', '}')
+ return text.strip()
+
+ #print("Trying to decode: %s" %hexcode)
+ if syntax:
+ if arch == CS_ARCH_MIPS:
+ p = Popen(['llvm-mc', '-disassemble', '-print-imm-hex', '-mattr=+msa', syntax] + option, stdout=PIPE, stdin=PIPE, stderr=STDOUT)
+ else:
+ p = Popen(['llvm-mc', '-disassemble', '-print-imm-hex', syntax] + option, stdout=PIPE, stdin=PIPE, stderr=STDOUT)
+ else:
+ if arch == CS_ARCH_MIPS:
+ p = Popen(['llvm-mc', '-disassemble', '-print-imm-hex', '-mattr=+msa'] + option, stdout=PIPE, stdin=PIPE, stderr=STDOUT)
+ else:
+ p = Popen(['llvm-mc', '-disassemble', '-print-imm-hex'] + option, stdout=PIPE, stdin=PIPE, stderr=STDOUT)
+ output = p.communicate(input=hexcode)[0]
+ lines = output.split('\n')
+ #print lines
+ if 'invalid' in lines[0]:
+ #print 'invalid ----'
+ return 'FAILED to disassemble (MC)'
+ else:
+ #print 'OK:', lines[1]
+ return normalize(lines[1].strip())
+
+def test_file(fname):
+ print("Test %s" %fname);
+ f = open(fname)
+ lines = f.readlines()
+ f.close()
+
+ if not lines[0].startswith('# '):
+ print("ERROR: decoding information is missing")
+ return
+
+ # skip '# ' at the front, then split line to get out hexcode
+ # Note: option can be '', or 'None'
+ #print lines[0]
+ #print lines[0][2:].split(', ')
+ (arch, mode, option) = lines[0][2:].split(', ')
+ mode = mode.replace(' ', '')
+ option = option.strip()
+
+ archs = {
+ "CS_ARCH_ARM": CS_ARCH_ARM,
+ "CS_ARCH_ARM64": CS_ARCH_ARM64,
+ "CS_ARCH_MIPS": CS_ARCH_MIPS,
+ "CS_ARCH_PPC": CS_ARCH_PPC,
+ "CS_ARCH_SPARC": CS_ARCH_SPARC,
+ "CS_ARCH_SYSZ": CS_ARCH_SYSZ,
+ "CS_ARCH_X86": CS_ARCH_X86,
+ "CS_ARCH_XCORE": CS_ARCH_XCORE,
+ "CS_ARCH_RISCV": CS_ARCH_RISCV
+ # "CS_ARCH_M68K": CS_ARCH_M68K,
+ }
+
+ modes = {
+ "CS_MODE_16": CS_MODE_16,
+ "CS_MODE_32": CS_MODE_32,
+ "CS_MODE_64": CS_MODE_64,
+ "CS_MODE_MIPS32": CS_MODE_MIPS32,
+ "CS_MODE_MIPS64": CS_MODE_MIPS64,
+ "0": CS_MODE_ARM,
+ "CS_MODE_ARM": CS_MODE_ARM,
+ "CS_MODE_THUMB": CS_MODE_THUMB,
+ "CS_MODE_ARM+CS_MODE_V8": CS_MODE_ARM+CS_MODE_V8,
+ "CS_MODE_THUMB+CS_MODE_V8": CS_MODE_THUMB+CS_MODE_V8,
+ "CS_MODE_THUMB+CS_MODE_MCLASS": CS_MODE_THUMB+CS_MODE_MCLASS,
+ "CS_MODE_LITTLE_ENDIAN": CS_MODE_LITTLE_ENDIAN,
+ "CS_MODE_BIG_ENDIAN": CS_MODE_BIG_ENDIAN,
+ "CS_MODE_64+CS_MODE_LITTLE_ENDIAN": CS_MODE_64+CS_MODE_LITTLE_ENDIAN,
+ "CS_MODE_64+CS_MODE_BIG_ENDIAN": CS_MODE_64+CS_MODE_BIG_ENDIAN,
+ "CS_MODE_MIPS32+CS_MODE_MICRO": CS_MODE_MIPS32+CS_MODE_MICRO,
+ "CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN": CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN,
+ "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN+CS_MODE_MICRO": CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN,
+ "CS_MODE_BIG_ENDIAN+CS_MODE_V9": CS_MODE_BIG_ENDIAN + CS_MODE_V9,
+ "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN": CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN,
+ "CS_MODE_MIPS32+CS_MODE_LITTLE_ENDIAN": CS_MODE_MIPS32+CS_MODE_LITTLE_ENDIAN,
+ "CS_MODE_MIPS64+CS_MODE_LITTLE_ENDIAN": CS_MODE_MIPS64+CS_MODE_LITTLE_ENDIAN,
+ "CS_MODE_MIPS64+CS_MODE_BIG_ENDIAN": CS_MODE_MIPS64+CS_MODE_BIG_ENDIAN,
+ "CS_MODE_RISCV32": CS_MODE_RISCV32,
+ "CS_MODE_RISCV64": CS_MODE_RISCV64,
+ }
+
+ options = {
+ "CS_OPT_SYNTAX_ATT": CS_OPT_SYNTAX_ATT,
+ "CS_OPT_SYNTAX_NOREGNAME": CS_OPT_SYNTAX_NOREGNAME,
+ }
+
+ mc_modes = {
+ ("CS_ARCH_X86", "CS_MODE_32"): ['-triple=i386'],
+ ("CS_ARCH_X86", "CS_MODE_64"): ['-triple=x86_64'],
+ ("CS_ARCH_ARM", "CS_MODE_ARM"): ['-triple=armv7'],
+ ("CS_ARCH_ARM", "CS_MODE_THUMB"): ['-triple=thumbv7'],
+ ("CS_ARCH_ARM", "CS_MODE_ARM+CS_MODE_V8"): ['-triple=armv8'],
+ ("CS_ARCH_ARM", "CS_MODE_THUMB+CS_MODE_V8"): ['-triple=thumbv8'],
+ ("CS_ARCH_ARM", "CS_MODE_THUMB+CS_MODE_MCLASS"): ['-triple=thumbv7m'],
+ ("CS_ARCH_ARM64", "0"): ['-triple=aarch64'],
+ ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN"): ['-triple=mips'],
+ ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_MICRO"): ['-triple=mipsel', '-mattr=+micromips'],
+ ("CS_ARCH_MIPS", "CS_MODE_MIPS64"): ['-triple=mips64el'],
+ ("CS_ARCH_MIPS", "CS_MODE_MIPS32"): ['-triple=mipsel'],
+ ("CS_ARCH_MIPS", "CS_MODE_MIPS64+CS_MODE_BIG_ENDIAN"): ['-triple=mips64'],
+ ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_MICRO+CS_MODE_BIG_ENDIAN"): ['-triple=mips', '-mattr=+micromips'],
+ ("CS_ARCH_MIPS", "CS_MODE_MIPS32+CS_MODE_BIG_ENDIAN+CS_MODE_MICRO"): ['-triple=mips', '-mattr=+micromips'],
+ ("CS_ARCH_PPC", "CS_MODE_BIG_ENDIAN"): ['-triple=powerpc64'],
+ ('CS_ARCH_SPARC', 'CS_MODE_BIG_ENDIAN'): ['-triple=sparc'],
+ ('CS_ARCH_SPARC', 'CS_MODE_BIG_ENDIAN+CS_MODE_V9'): ['-triple=sparcv9'],
+ ('CS_ARCH_SYSZ', '0'): ['-triple=s390x', '-mcpu=z196'],
+ ('CS_ARCH_RISCV', 'CS_MODE_RISCV32'): ['-triple=riscv32'],
+ ('CS_ARCH_RISCV', 'CS_MODE_RISCV64'): ['-triple=riscv64'],
+ }
+
+ #if not option in ('', 'None'):
+ # print archs[arch], modes[mode], options[option]
+
+ #print(arch, mode, option)
+ md = Cs(archs[arch], modes[mode])
+
+ mc_option = None
+ if arch == 'CS_ARCH_X86':
+ # tell llvm-mc to use Intel syntax
+ mc_option = '-output-asm-variant=1'
+
+ if arch == 'CS_ARCH_ARM' or arch == 'CS_ARCH_PPC' :
+ md.syntax = CS_OPT_SYNTAX_NOREGNAME
+
+ if fname.endswith('3DNow.s.cs'):
+ md.syntax = CS_OPT_SYNTAX_ATT
+
+ for line in lines[1:]:
+ # ignore all the input lines having # in front.
+ if line.startswith('#'):
+ continue
+ #print("Check %s" %line)
+ code = line.split(' = ')[0]
+ asm = ''.join(line.split(' = ')[1:])
+ hex_code = code.replace('0x', '')
+ hex_code = hex_code.replace(',', '')
+ hex_data = hex_code.decode('hex')
+ #hex_bytes = array.array('B', hex_data)
+
+ x = list(md.disasm(hex_data, 0))
+ if len(x) > 0:
+ if x[0].op_str != '':
+ cs_output = "%s %s" %(x[0].mnemonic, x[0].op_str)
+ else:
+ cs_output = x[0].mnemonic
+ else:
+ cs_output = 'FAILED to disassemble'
+
+ cs_output2 = normalize_hex(cs_output)
+ cs_output2 = cs_output2.replace(' ', '')
+
+ if arch == 'CS_ARCH_MIPS':
+ # normalize register alias names
+ cs_output2 = cs_output2.replace('$at', '$1')
+ cs_output2 = cs_output2.replace('$v0', '$2')
+ cs_output2 = cs_output2.replace('$v1', '$3')
+
+ cs_output2 = cs_output2.replace('$a0', '$4')
+ cs_output2 = cs_output2.replace('$a1', '$5')
+ cs_output2 = cs_output2.replace('$a2', '$6')
+ cs_output2 = cs_output2.replace('$a3', '$7')
+
+ cs_output2 = cs_output2.replace('$t0', '$8')
+ cs_output2 = cs_output2.replace('$t1', '$9')
+ cs_output2 = cs_output2.replace('$t2', '$10')
+ cs_output2 = cs_output2.replace('$t3', '$11')
+ cs_output2 = cs_output2.replace('$t4', '$12')
+ cs_output2 = cs_output2.replace('$t5', '$13')
+ cs_output2 = cs_output2.replace('$t6', '$14')
+ cs_output2 = cs_output2.replace('$t7', '$15')
+ cs_output2 = cs_output2.replace('$t8', '$24')
+ cs_output2 = cs_output2.replace('$t9', '$25')
+
+ cs_output2 = cs_output2.replace('$s0', '$16')
+ cs_output2 = cs_output2.replace('$s1', '$17')
+ cs_output2 = cs_output2.replace('$s2', '$18')
+ cs_output2 = cs_output2.replace('$s3', '$19')
+ cs_output2 = cs_output2.replace('$s4', '$20')
+ cs_output2 = cs_output2.replace('$s5', '$21')
+ cs_output2 = cs_output2.replace('$s6', '$22')
+ cs_output2 = cs_output2.replace('$s7', '$23')
+
+ cs_output2 = cs_output2.replace('$k0', '$26')
+ cs_output2 = cs_output2.replace('$k1', '$27')
+
+ #print("Running MC ...")
+ if fname.endswith('thumb-fp-armv8.s.cs'):
+ mc_output = run_mc(archs[arch], code, ['-triple=thumbv8'], mc_option)
+ elif fname.endswith('mips64-alu-instructions.s.cs'):
+ mc_output = run_mc(archs[arch], code, ['-triple=mips64el', '-mcpu=mips64r2'], mc_option)
+ else:
+ mc_output = run_mc(archs[arch], code, mc_modes[(arch, mode)], mc_option)
+ mc_output2 = normalize_hex(mc_output)
+
+ if arch == 'CS_ARCH_MIPS':
+ mc_output2 = mc_output2.replace(' 0(', '(')
+
+ if arch == 'CS_ARCH_PPC':
+ mc_output2 = mc_output2.replace('.+', '')
+ mc_output2 = mc_output2.replace('.', '')
+ mc_output2 = mc_output2.replace(' 0(', '(')
+
+ mc_output2 = mc_output2.replace(' ', '')
+ mc_output2 = mc_output2.replace('opaque', '')
+
+
+ if (cs_output2 != mc_output2):
+ asm = asm.replace(' ', '').strip().lower()
+ if asm != cs_output2:
+ print("Mismatch: %s" %line.strip())
+ print("\tMC = %s" %mc_output)
+ print("\tCS = %s" %cs_output)
+
+
+if __name__ == '__main__':
+ if len(sys.argv) == 1:
+ fnames = sys.stdin.readlines()
+ for fname in fnames:
+ test_file(fname.strip())
+ else:
+ #print("Usage: ./test_mc.py <input-file.s.cs>")
+ test_file(sys.argv[1])
+