diff --git a/Dockerfile b/Dockerfile
index 40f4962..e473123 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,20 +2,11 @@ FROM rizin/rizin:latest
 
 USER root
 RUN apt-get update && apt-get install --yes \
-    build-essential \
-    git \
     python3-setuptools \
     python3-sqlalchemy \
     && apt-get clean
 
-RUN git clone -b next https://github.com/capstone-engine/capstone.git /capstone/
-WORKDIR /capstone/
-RUN sh /capstone/make.sh
-RUN sh /capstone/make.sh install
-
 COPY setup.py /app/
 COPY subdisassem /app/subdisassem/
 WORKDIR /app/
 RUN python3 setup.py install
-
-USER rizin
diff --git a/setup.py b/setup.py
index 6c110c3..b7a2d2f 100644
--- a/setup.py
+++ b/setup.py
@@ -12,6 +12,7 @@ setup(
     python_requires=">3",
     install_requires=[
         "capstone",
+        "rzpipe",
         "SQLAlchemy",
     ],
 )
diff --git a/subdisassem/capstone_wrapper.py b/subdisassem/capstone_wrapper.py
index 639813f..614ad74 100644
--- a/subdisassem/capstone_wrapper.py
+++ b/subdisassem/capstone_wrapper.py
@@ -31,7 +31,6 @@ import logging
 
 class _CapstoneBase:
     def __init__(self, payload: bytes, offset: int = 0):
-        self.arch = self.__class__.__name__
         self.capstone.skipdata = True
         self.capstone.skipdata_setup = ("unknown", None, None)
 
diff --git a/subdisassem/rizin_wrapper.py b/subdisassem/rizin_wrapper.py
new file mode 100644
index 0000000..af14708
--- /dev/null
+++ b/subdisassem/rizin_wrapper.py
@@ -0,0 +1,114 @@
+from collections import Counter
+from pathlib import Path
+import logging
+import rzpipe
+import json
+
+
+class _RizinBase:
+    """Disassemble a file with rizin through rzpipe.
+
+    Subclasses set ``arch_cmds``, the rizin commands that are run before
+    the analysis to select the architecture.
+    """
+
+    def __init__(self, path: Path):
+        """Run rizin on ``path`` and keep the parsed ``pdj`` output."""
+        rz_pipe = rzpipe.open(str(path.absolute()))
+
+        # Close the pipe even when a command raises, so it is always
+        # released.
+        try:
+            for cmd in self.arch_cmds:
+                rz_pipe.cmd(cmd)
+
+            rz_pipe.cmd("aa")
+            result = rz_pipe.cmd("pdj")
+        finally:
+            rz_pipe.quit()
+
+        # An empty reply is not valid JSON; treat it as nothing disassembled.
+        self.disassembly = json.loads(result) if result else []
+
+    def __repr__(self) -> str:
+        return self.objdump
+
+    def __len__(self) -> int:
+        """Return the number of disassembled entries."""
+        return len(self.disassembly)
+
+    def __lt__(self, other):
+        return len(self) < len(other)
+
+    @staticmethod
+    def _rates(mnemonics: list) -> list:
+        """Return ``(count, mnemonic)`` pairs, highest pair first."""
+        counts = Counter(mnemonics)
+        return sorted(((n, name) for name, n in counts.items()), reverse=True)
+
+    @property
+    def objdump(self) -> str:
+        """Return the listing as ``offset:<TAB>opcode`` lines."""
+        if hasattr(self, "_objdump"):
+            return self._objdump
+
+        self._objdump = "".join(
+            f"{each.get('offset'):#02x}:\t{each.get('opcode')}\n"
+            for each in self.disassembly
+        )
+
+        return self._objdump
+
+    @property
+    def disasm(self) -> list:
+        """Return ``[offset, mnemonic, operands]`` for every entry.
+
+        ``mnemonic`` is ``None`` when the entry has no opcode.
+        """
+        if hasattr(self, "_disasm"):
+            return self._disasm
+
+        self._disasm = list()
+
+        for each in self.disassembly:
+            offset = each.get("offset")
+            opcode = each.get("opcode")
+
+            if opcode:
+                mnemonic, *opcode = opcode.split(" ")
+            else:
+                mnemonic = None
+
+            self._disasm.append([offset, mnemonic, opcode])
+
+        return self._disasm
+
+    @property
+    def rets(self) -> list:
+        """Return every mnemonic that contains ``ret``."""
+        if hasattr(self, "_rets"):
+            return self._rets
+
+        self._rets = [
+            mnemonic
+            for _, mnemonic, _ in self.disasm
+            if mnemonic and "ret" in mnemonic
+        ]
+
+        return self._rets
+
+    @property
+    def ret_rates(self) -> list:
+        """Return ``(count, mnemonic)`` pairs of the rets, descending."""
+        return self._rates(self.rets)
+
+    @property
+    def mnemonic_rates(self) -> list:
+        """Return ``(count, mnemonic)`` pairs of all mnemonics, descending."""
+        return self._rates([name for _, name, _ in self.disasm if name])
+
+
+class x86_16(_RizinBase):
+    """16-bit x86."""
+
+    arch_cmds = ["e asm.arch=x86", "e asm.bits=16"]
diff --git a/subdisassem/scripts.py b/subdisassem/scripts.py
index f3aac5a..8e8b1bb 100644
--- a/subdisassem/scripts.py
+++ b/subdisassem/scripts.py
@@ -5,6 +5,7 @@ from sqlalchemy import desc
 import logging
 
 from . import capstone_wrapper
+from . import rizin_wrapper
 from .schema import db_config, Disassembly
 
 
@@ -21,9 +22,11 @@ def subdisassem_script():
     args.bin_path = Path(args.bin_path)
 
     if args.verbose:
+        print_count = None  # no limit; -1 would drop the last entry
         level = logging.DEBUG
         format = "%(asctime)s %(filename)s:%(lineno)d %(message)s"
     else:
+        print_count = 5
         level = logging.INFO
         format = "%(asctime)s %(message)s"
 
@@ -58,7 +61,7 @@ def subdisassem_script():
 
     logging.info(f"sha1sum: {checksum}")
 
-    archs = [
+    capstone_archs = [
         capstone_wrapper.x86_16,
         capstone_wrapper.x86_32,
         capstone_wrapper.x86_64,
@@ -74,13 +77,14 @@ def subdisassem_script():
         capstone_wrapper.xcore,
     ]
 
-    for arch in archs:
+    for arch in capstone_archs:
         for offset in range(args.fuzz):
             exists = (
                 session.query(Disassembly)
                 .filter(Disassembly.checksum == checksum)
                 .filter(Disassembly.offset == offset)
                 .filter(Disassembly.arch == arch.__name__)
+                .filter(Disassembly.engine == str(arch.__bases__))
                 .first()
             )
 
@@ -92,15 +96,54 @@ def subdisassem_script():
 
             disasembler = arch(payload=raw_bytes, offset=offset)
             row = Disassembly()
-            row.arch = disasembler.arch
+            row.arch = disasembler.__class__.__name__
             row.checksum = checksum
             row.count = len(disasembler)
             row.engine = str(arch.__bases__)
-            row.mnemonic_rates = str(disasembler.mnemonic_rates[:5])
+            row.mnemonic_rates = str(disasembler.mnemonic_rates[:print_count])
             row.offset = offset
             row.opcodes = disasembler.objdump
             row.path = str(args.bin_path.absolute())
-            row.ret_rates = str(disasembler.ret_rates[:5])
+            row.ret_rates = str(disasembler.ret_rates[:print_count])
+            row.size = len(raw_bytes) - offset
+            session.add(row)
+
+    session.commit()
+
+    rizin_archs = [
+        rizin_wrapper.x86_16,
+    ]
+
+    for arch in rizin_archs:
+        for offset in range(args.fuzz):
+            exists = (
+                session.query(Disassembly)
+                .filter(Disassembly.checksum == checksum)
+                .filter(Disassembly.offset == offset)
+                .filter(Disassembly.arch == arch.__name__)
+                .filter(Disassembly.engine == str(arch.__bases__))
+                .first()
+            )
+
+            if exists:
+                logging.debug(
+                    f"subdiassembly_exists: {[arch.__name__, checksum, offset]}"
+                )
+                continue
+
+            # NOTE(review): the rizin wrapper is not given the offset, so every
+            # fuzz offset stores the same disassembly -- confirm this is intended.
+            disasembler = arch(path=args.bin_path)
+            row = Disassembly()
+            row.arch = disasembler.__class__.__name__
+            row.checksum = checksum
+            row.count = len(disasembler)
+            row.engine = str(arch.__bases__)
+            row.mnemonic_rates = str(disasembler.mnemonic_rates[:print_count])
+            row.offset = offset
+            row.opcodes = disasembler.objdump
+            row.path = str(args.bin_path.absolute())
+            row.ret_rates = str(disasembler.ret_rates[:print_count])
             row.size = len(raw_bytes) - offset
             session.add(row)
 
@@ -108,7 +151,18 @@ def subdisassem_script():
 
     tops = list()
 
-    for arch in archs:
+    for arch in capstone_archs:
+        top = (
+            session.query(Disassembly)
+            .filter(Disassembly.engine == str(arch.__bases__))
+            .filter(Disassembly.arch == arch.__name__)
+            .order_by(desc("count"))
+            .first()
+        )
+        tops.append(top)
+
+    for arch in rizin_archs:
         top = (
             session.query(Disassembly)
+            .filter(Disassembly.engine == str(arch.__bases__))
             .filter(Disassembly.arch == arch.__name__)
@@ -119,5 +173,5 @@ def subdisassem_script():
 
     tops = sorted(tops, key=len, reverse=True)
 
-    for top in tops[:5]:
+    for top in tops[:print_count]:
         logging.info(top)