REVEN-Axion 2018v1.4.4
ida_xref_simple.py

This is an example of an IDA script using the REVEN Python API. It must be run from IDA!

See the script's documentation below for more information:

1 """
2 Purpose:
3  This script is a proof-of-concept to add code cross references for dynamic
4  control flow instructions (whose targets cannot be computed by IDA's static
5  analysis) of a binary in IDA. The cross references are local: callees
6  that are not in the binary are not counted.
7 
8 Usage:
9  Use IDA to load the binary, and give the following arguments before
10  executing the script:
11  host = your REVEN server name, and
12  port = your REVEN's project port on this host.
13 
14 Remark:
15  The script takes targets from the execution trace logged by REVEN (i.e. dynamic
16  analysis). It has the following limits:
17  - the name of the binary must be obtained from REVEN (that is usually the
18  case if "dump process" is executed properly), and be case insensitively
19  identical with the one analyzed in IDA (i.e. we do not rename the binary)
20 
21  - the binary must be mapped at a unique virtual base address in the REVEN's
22  project trace
23 
24  - the added cross-refs are not complete (as an inherent problem of dynamic
25  analysis)
26 
27  - self-modifying/overlapping instructions are checked in a very limited way
28  (does not assume any coherent result in case of self-modifying/packed code).
29 """
30 
import os

import idaapi
import idautils
import idc

import reven
36 
37 
# IDA instruction types (``ins.itype``) that collect_indirect_jccs() accepts as
# jump candidates.
ida_dynamic_jump_types = [
    idaapi.NN_jmp,
    idaapi.NN_jmpfi,
    idaapi.NN_jmpni,
    idaapi.NN_jmpshort,
]

# IDA instruction types that collect_indirect_jccs() accepts as call candidates.
ida_dynamic_call_types = [
    idaapi.NN_call,
    idaapi.NN_callfi,
    idaapi.NN_callni,
]

# IDA instruction types that collect_indirect_jccs() accepts as return candidates.
ida_dynamic_ret_types = [
    idaapi.NN_retn,
    idaapi.NN_retf,
]

# REVEN instruction mnemonics that add_xrefs() accepts as the source of a
# control flow transfer.
reven_jcc_mnemonics = ['jmp', 'call', 'retn', 'retf']
57 
58 
def main(host, port):
    """
    Connect to a REVEN project and add to IDA the code cross references found in its trace.

    host: the REVEN server name.
    port: the REVEN project's port on this host.

    Nothing is added when get_base_address() cannot determine a single runtime base address
    for the binary currently loaded in IDA; a message is printed instead.
    """
    project = reven.Project(host, port)

    runtime_base_address = get_base_address(project)
    if runtime_base_address is None:
        print('cannot find the binary {:s} in the REVEN\'s trace or it is mapped at different base addresses.'.format(
            os.path.basename(idc.GetInputFilePath())))
        return

    static_base_address = idaapi.get_imagebase()

    print('base addresses:')
    print(' static: 0x{:x}'.format(static_base_address))
    print(' runtime: 0x{:x}'.format(runtime_base_address))

    inss = collect_indirect_jccs()
    trace = get_binary_trace(project)
    # Subtracting this offset from an address of the trace gives the matching address in IDA.
    offset = runtime_base_address - static_base_address

    add_xrefs(inss, trace, offset)

    print('adding code xref done.')
79 
80 
def get_base_address(reven_project):
    """
    Look for the current IDA's binary name in the REVEN's binary mapping information.

    If the binary is found and all of its mappings share the same base address, return that
    base address. Return None when the binary is not found, or when it is mapped at more than
    one base address.
    """
    # The path information is completely irrelevant, so we will match against the binary name
    # only, ignoring the case.
    # NOTE(review): os.path.basename only splits on the separators of the platform IDA runs on;
    # confirm that the paths reported by REVEN use the same separators.
    binary_name = os.path.basename(idc.GetInputFilePath()).lower()

    base_address = None
    loaded_binaries = reven_project.binaries()
    for bin_path in loaded_binaries:
        if os.path.basename(bin_path).lower() != binary_name:
            continue

        # Reven stores all the base addresses this binary is mapped at during the trace.
        # There could be more than one if more than one process uses this binary
        for address_space in loaded_binaries[bin_path].mappings.values():
            if base_address is None:
                base_address = address_space.base_address
            elif base_address != address_space.base_address:
                # More than one process uses the binary, and at different addresses!
                # We don't have enough information to pick one.
                return None

    return base_address
105 
106 
def get_binary_trace(reven_project):
    """
    Return a python generator that filters the trace on the current IDA's binary only.

    The filter is the result of a case insensitive search on the binary's file name in the
    trace named 'Execution run'.
    """
    # Select the main trace
    trace = reven_project.trace('Execution run')
    # Or in a better way:
    # trace = reven_project.traces()[0]

    # Again, the binary's path is irrelevant, match against its name only.
    binary_name = os.path.basename(idc.GetInputFilePath())

    # See the documentation for more information, but note that this defaults to a "contains" matching algorithm.
    # It may therefore include more than the binary we want.
    criterion = reven.BinaryCriterion(pattern=binary_name, case_sensitive=False)
    return trace.search_point([criterion])
124 
125 
def _collect_control_flow(jcc_dict, reven_points, relocation_offset):
    """
    Return the list of distinct (caller instruction, callee instruction) pairs found in the trace.

    jcc_dict: the instructions found by the static analysis, indexed by their address in IDA.
    reven_points: the points of the REVEN trace to iterate on.
    relocation_offset: the offset to subtract from a trace address to get an address in IDA.
    """
    bb_control_flow = []
    examined_control_flow = set()
    prev_point = None

    # Let's iterate on the binary's trace and store relevant jumps in bb_control_flow.
    # While doing so, we will have to filter out edge cases such as the binary jumping out to another.
    # Also, the generator given by the caller will return the first point of each sequence only, not all the
    # points, so getting the information we want requires a bit of fiddling.
    for curr_point in reven_points:
        if prev_point is not None:
            # Look for the last instruction of the previous basic block and
            # its next instruction
            target_point = prev_point
            # Reset on every iteration, so that a basic block without any instruction cannot
            # reuse the instruction of an older basic block.
            last_ins_of_prev_bb = None
            for last_ins_of_prev_bb in prev_point.basic_block:
                target_point = target_point.next()

            if last_ins_of_prev_bb is not None:
                # The target point is always the next instruction, but it is not always
                # in the local trace (e.g. it may be the target of a call to an
                # external function)
                curr_ins = curr_point.instruction
                target_ins = target_point.instruction
                control_flow = (last_ins_of_prev_bb.address, curr_ins.address)

                # We have to check:
                # 1. the target instruction is in the local trace
                # 2. the caller is a dynamic control flow instruction
                # 3. the caller have been discovered by the static analysis
                # 4. the control flow (caller, callee) is not counted
                if (curr_ins.address == target_ins.address and
                        last_ins_of_prev_bb.mnemonic in reven_jcc_mnemonics and
                        last_ins_of_prev_bb.address - relocation_offset in jcc_dict and
                        control_flow not in examined_control_flow):
                    examined_control_flow.add(control_flow)
                    bb_control_flow.append((last_ins_of_prev_bb, curr_ins))

        prev_point = curr_point

    return bb_control_flow


def add_xrefs(ida_jc_inss, reven_points, relocation_offset):
    """
    Add to IDA the code cross references found in the REVEN trace, then print a summary.

    ida_jc_inss: the IDA instructions to consider as callers (see collect_indirect_jccs); their
        `ea` and `size` attributes are read.
    reven_points: the points of the REVEN trace to iterate on (see get_binary_trace).
    relocation_offset: the runtime base address minus the IDA's image base; it is subtracted
        from the addresses of the trace to get addresses in IDA.

    A warning is printed for each caller whose bytes in the trace differ from its bytes in IDA.
    """
    jcc_dict = {ins.ea: idc.GetManyBytes(ins.ea, ins.size) for ins in ida_jc_inss}

    # bb_control_flow contains a list of (caller, callee) extracted from the REVEN's project
    # trace; all the callers are guaranteed to be in jcc_dict.
    bb_control_flow = _collect_control_flow(jcc_dict, reven_points, relocation_offset)

    updated_jmp_xrefs = set()
    updated_call_xrefs = set()
    updated_ret_xrefs = set()

    for (prev_ins, curr_ins) in bb_control_flow:
        caller_address = prev_ins.address - relocation_offset
        target_address = curr_ins.address - relocation_offset

        if jcc_dict[caller_address] != prev_ins.raw_bytes:
            print('warning: instruction at 0x{:x} is modified in running or loading time'.format(caller_address))

        mnemonic = prev_ins.mnemonic
        if mnemonic == 'jmp':
            flow_type, updated_xrefs = idc.fl_JF, updated_jmp_xrefs
        elif mnemonic == 'call':
            flow_type, updated_xrefs = idc.fl_CF, updated_call_xrefs
        else:  # mnemonic is 'retn' or 'retf'
            flow_type, updated_xrefs = idc.fl_JF, updated_ret_xrefs

        xref = (caller_address, target_address)
        if xref not in updated_xrefs:
            idc.AddCodeXref(caller_address, target_address, flow_type)
            updated_xrefs.add(xref)

    print('updated xrefs from runtime information:')
    print(' jmp: {:d}'.format(len(updated_jmp_xrefs)))
    print(' call: {:d}'.format(len(updated_call_xrefs)))
    print(' ret: {:d}'.format(len(updated_ret_xrefs)))

    summary = (
        ('jmp', updated_jmp_xrefs),
        ('call', updated_call_xrefs),
        ('ret', updated_ret_xrefs),
    )
    for (label, updated_xrefs) in summary:
        for (caller_address, target_address) in updated_xrefs:
            print('0x{:x} => 0x{:x} ({})'.format(caller_address, target_address, label))
207 
208 
def collect_indirect_jccs():
    """
    Collect the control flow instructions of the IDA's database that add_xrefs may use as callers.

    All the heads of all the segments are visited. An instruction is kept when:
    - its type is in ida_dynamic_ret_types, or
    - it has the idaapi.CF_JUMP feature and its type is in ida_dynamic_jump_types or
      ida_dynamic_call_types.

    Print the number of instructions of each kind, and return a single list made of the calls,
    then the jumps, then the returns. An address is never collected twice.
    """
    indirect_calls = []
    indirect_jumps = []
    indirect_rets = []
    # Addresses are remembered in a set: looking for an instruction in the lists would cost a
    # full scan of the list for each instruction.
    collected_addresses = set()

    for seg in idautils.Segments():
        for head in idautils.Heads(idc.SegStart(seg), idc.SegEnd(seg)):
            if head in collected_addresses or not idc.isCode(idc.GetFlags(head)):
                continue

            ins = idautils.DecodeInstruction(head)
            if ins is None:
                continue

            # Returns are checked independently of CF_JUMP, so that they are collected whether
            # or not IDA sets this feature on them.
            if ins.itype in ida_dynamic_ret_types:
                collected = indirect_rets
            elif not (ins.get_canon_feature() & idaapi.CF_JUMP):
                continue
            elif ins.itype in ida_dynamic_jump_types:
                collected = indirect_jumps
            elif ins.itype in ida_dynamic_call_types:
                collected = indirect_calls
            else:
                continue

            collected_addresses.add(head)
            collected.append(ins)

    print('statically detected jccs:')
    print(' jmp: {:d}'.format(len(indirect_jumps)))
    print(' call: {:d}'.format(len(indirect_calls)))
    print(' ret: {:d}'.format(len(indirect_rets)))

    return indirect_calls + indirect_jumps + indirect_rets
236 
237 
# Script entry point: ask for the REVEN's project address as "host:port", then run main().
if __name__ == '__main__':
    host_port_str = idc.AskStr('localhost:13370', "REVEN's project address")
    if host_port_str is not None:
        try:
            # Only the parsing of the address is guarded here, so that a ValueError raised later
            # by main() is not reported as a wrong address.
            host, port_str = host_port_str.split(':')
            port = int(port_str)
        except ValueError:
            print("please give a correct REVEN's project address, e.g. localhost:13370")
        else:
            print("REVEN's project: {}:{}".format(host, port))
            try:
                main(host, port)
            except RuntimeError as e:
                print('{}'.format(e))
            except Exception as e:
                # Report what went wrong instead of hiding it behind a generic message.
                print('Unknown error: {}: {}'.format(type(e).__name__, e))