dwarf-synthesis/DwarfSynth/Simplest.ml

722 lines
23 KiB
OCaml

open Std
module CFG = BStd.Graphs.Ir
type memory_offset = int64
type memory_address = int64
module AddrMap = Map.Make(Int64)
module AddrSet = Set.Make(Int64)
type cfa_pos =
RspOffset of memory_offset
| RbpOffset of memory_offset
| CfaLostTrack
type rbp_pos =
| RbpUndef
| RbpCfaOffset of memory_offset
type reg_pos = cfa_pos * rbp_pos
type reg_changes_fde = reg_pos AddrMap.t
type subroutine_cfa_data = {
reg_changes_fde: reg_changes_fde;
beg_pos: memory_address;
end_pos: memory_address;
}
type block_local_state = {
rbp_vars: BStd.Var.Set.t;
rbp_pop_set: BStd.Tid.Set.t
}
module StrMap = Map.Make(String)
type subroutine_cfa_map = subroutine_cfa_data StrMap.t
module TIdMap = Map.Make(BStd.Tid)
exception InvalidSub
exception UnexpectedRbpSet
type synthesis_settings = {
mutable no_rbp_undef: bool
}
let __settings = {
no_rbp_undef = false
}
let pp_cfa_pos ppx = function
| RspOffset off -> Format.fprintf ppx "RSP + (%s)" (Int64.to_string off)
| RbpOffset off -> Format.fprintf ppx "RBP + (%s)" (Int64.to_string off)
| CfaLostTrack -> Format.fprintf ppx "??@."
let pp_rbp_pos ppx = function
| RbpUndef -> Format.fprintf ppx "u"
| RbpCfaOffset off -> Format.fprintf ppx "c%+Ld" off
let pp_reg_pos ppx (cfa_pos, rbp_pos) =
Format.fprintf ppx "(%a; %a)" pp_cfa_pos cfa_pos pp_rbp_pos rbp_pos
let pp_int64_hex ppx number =
let mask_short = Int64.(pred (shift_left one 16)) in
let pp_short number =
Format.fprintf ppx "%04x" Int64.(to_int (logand number mask_short))
in
List.iter pp_short @@ List.map (fun x ->
Int64.(shift_right number (16*x))) [3;2;1;0]
let pp_cfa_changes_fde ppx cfa_changes = AddrMap.iter
(fun addr change ->
Format.fprintf ppx "%a: %a@."
pp_int64_hex addr
pp_cfa_pos change)
cfa_changes
let pp_cfa_changes ppx =
StrMap.iter (fun fde_name entry ->
Format.fprintf ppx "%s@\n====@\n@\n%a@." fde_name
pp_cfa_changes_fde entry)
let pp_option_of sub_pp ppx = function
| None -> Format.fprintf ppx "None"
| Some x -> Format.fprintf ppx "Some %a" sub_pp x
let opt_addr_of term =
(** Get the address of a term as an option, if it has one*)
BStd.Term.get_attr term BStd.address
let addr_of term =
(** Get the address of a term *)
match opt_addr_of term with
| None -> assert false
| Some addr -> addr
let opt_addr_of_blk_elt = function
| `Def def -> opt_addr_of def
| `Jmp jmp -> opt_addr_of jmp
| `Phi phi -> opt_addr_of phi
let entrypoint_address blk =
(** Find the first instruction address in the current block.
Return None if no instruction has address. *)
let fold_one accu cur_elt = match accu, opt_addr_of_blk_elt cur_elt with
| None, None -> None
| None, Some x -> Some x
| _, _ -> accu
in
BStd.Seq.fold (BStd.Blk.elts blk)
~init:None
~f:fold_one
let to_int64_addr addr =
BStd.Word.to_int64_exn addr
let int64_addr_of x = to_int64_addr @@ addr_of x
let map_option f = function
| None -> None
| Some x -> Some (f x)
let build_next_instr (disasm: BStd.disasm): AddrSet.t AddrMap.t =
(** Build a map of memory_address -> AddrSet.t holding, for each address, the
set of instructions coming right after the instruction at given address.
There might be multiple such addresses, if the current instruction is at
a point of branching. *)
let rec build_of_instr_list cur_map = function
(** Maps an instruction to its following instruction in this block *)
| (cur_mem, cur_insn) :: ((next_mem, next_insn) as elt2) :: tl ->
(* Its only successor is next_insn *)
let new_map =
(try
let cur_addr = to_int64_addr @@ BStd.Memory.min_addr cur_mem
and next_addr = to_int64_addr @@ BStd.Memory.min_addr next_mem in
AddrMap.add cur_addr (AddrSet.singleton next_addr) cur_map
with _ -> cur_map)
in
build_of_instr_list new_map (elt2 :: tl)
| (cur_mem, _) :: [] ->
let last_addr = (try Some (to_int64_addr @@ BStd.Memory.min_addr cur_mem)
with _ -> None) in
cur_map, last_addr
(* Ignore the last one: its successors are held in the graph *)
| [] ->
cur_map, None
in
let cfg = BStd.Disasm.cfg disasm in
let rec block_addresses block =
(try BStd.Block.addr block
|> to_int64_addr
|> AddrSet.singleton
with _ ->
(* Probably an intermediary node, eg. JMP --> [inermed node] --> BLK *)
let outputs = BStd.Graphs.Cfg.Node.outputs block cfg
|> BStd.Seq.map ~f:BStd.Graphs.Cfg.Edge.dst in
BStd.Seq.fold outputs
~init:AddrSet.empty
~f:(fun accu block -> AddrSet.union (block_addresses block) accu)
)
in
let build_of_block cur_map block =
let cur_map, last_addr =
build_of_instr_list cur_map (BStd.Block.insns block) in
(match last_addr with
| Some last_addr ->
let following_set = BStd.Graphs.Cfg.Node.outputs block cfg
|> BStd.Seq.fold
~init:AddrSet.empty
~f:(fun set edge -> AddrSet.union
(block_addresses
(BStd.Graphs.Cfg.Edge.dst edge))
set)
in
AddrMap.add last_addr following_set cur_map
| None -> cur_map
)
in
BStd.Seq.fold (BStd.Graphs.Cfg.nodes cfg)
~init:AddrMap.empty
~f:build_of_block
let find_rbp_pop_set cfg entry =
(** Returns a BStd.Tid.Set.t of the terms actually "popping" %rbp, that is,
the terms that should trigger a change to RbpUndef of the %rbp register.
The current heuristic is to consider the expressions
i) of the form `rbp = F(mem, rsp)` (alledgedly, rbp = something from the
stack);
ii) that are the last of this kind in the subroutine's CFG (ie. such that
there is not another instruction matching (i) that is reachable through
the CFG from the current instruction).
*)
let def_is_rbp_pop def =
let is_pop_expr expr =
let free_vars = BStd.Exp.free_vars expr in
let free_x86_regs = Regs.X86_64.map_varset free_vars in
(match Regs.DwRegOptSet.cardinal free_x86_regs with
| 2 ->
let reg = free_x86_regs
|> Regs.DwRegOptSet.filter
(fun x -> match x with None -> false | Some _ -> true)
|> Regs.DwRegOptSet.choose in
let has_mem_var = BStd.Var.Set.exists
~f:(fun x -> BStd.Var.name x = "mem")
free_vars in
(match reg, has_mem_var with
| Some dw_var, true when dw_var = Regs.X86_64.rsp -> true
| _ -> false)
| _ -> false
)
in
(match Regs.X86_64.of_var (BStd.Def.lhs def),
is_pop_expr @@ BStd.Def.rhs def with
| Some reg, true when reg = Regs.X86_64.rbp -> true
| _ -> false
)
in
let block_find_rbp_pop block =
let fold_elt = function
| `Def(def) when (def_is_rbp_pop def) -> Some (BStd.Term.tid def)
| _ -> None
in
let elts_seq = BStd.Blk.elts block in
let last_pop = BStd.Seq.fold elts_seq
~init:None
~f:(fun accu elt ->
(match fold_elt elt with
| None -> accu
| Some tid -> Some tid))
in
last_pop
in
let rec block_dfs node visited =
(* DFS on the CFG to find rbp pops, and rule out those that are not final
*)
let block = CFG.Node.label node in
(match BStd.Blk.Set.mem visited block with
| true ->
(* Loop: we already visited this node *)
BStd.Tid.Set.empty, true, visited
| false ->
let visited = BStd.Blk.Set.add visited block in
let pop_set, has_pop, visited =
BStd.Seq.fold (CFG.Node.succs node cfg)
~f:(fun (pre_pop_set, pre_has_pop, visited) child ->
let cur_pop_set, cur_has_pop, visited =
block_dfs child visited in
(BStd.Tid.Set.union pre_pop_set cur_pop_set),
(pre_has_pop || cur_has_pop),
visited
)
~init:(BStd.Tid.Set.empty, false, visited)
in
let pop_set, has_pop = (match has_pop with
| false -> (* No rbp pop below, we seek rbp pops in this block *)
(match block_find_rbp_pop block with
| None -> pop_set, false
| Some tid -> BStd.Tid.Set.add pop_set tid, true
)
| true -> pop_set, has_pop) in
pop_set, has_pop, visited
)
in
if __settings.no_rbp_undef then
BStd.Tid.Set.empty
else (
let pop_set, _, _ =
block_dfs entry (BStd.Blk.Set.empty) in
pop_set
)
let interpret_var_expr c_var offset expr = BStd.Bil.(
let closed_form = BStd.Exp.substitute
(var c_var)
(int (BStd.Word.of_int64 (Int64.neg offset)))
expr
in
let res = BStd.Exp.eval closed_form in
match res with
| Imm value ->
Some (Int64.neg @@ BStd.Word.to_int64_exn @@ BStd.Word.signed value)
| _ -> None
)
let is_single_free_reg expr =
(** Detects whether `expr` contains a single free variable which is a machine
register, and if so, extracts this register and returns a pair `(formal,
reg`) of its formal variable in the expression and the register.
Otherwise return None. *)
let free_vars = BStd.Exp.free_vars expr in
let free_x86_regs = Regs.X86_64.map_varset free_vars in
(match Regs.DwRegOptSet.cardinal free_x86_regs with
| 1 ->
let free_var = Regs.DwRegOptSet.choose free_x86_regs in
(match free_var with
| Some dw_var ->
let bil_var = (match BStd.Var.Set.choose free_vars with
| None -> assert false
| Some x -> x) in
Some (bil_var, dw_var)
| _ -> None
)
| _ -> None
)
let process_def (local_state: block_local_state) def (cur_reg: reg_pos)
: (reg_pos option * block_local_state) =
let lose_track = Some CfaLostTrack in
let cur_cfa, cur_rbp = cur_reg in
let out_cfa =
(match cur_cfa, Regs.X86_64.of_var (BStd.Def.lhs def) with
| RspOffset(cur_offset), Some reg when reg = Regs.X86_64.rsp ->
let exp = BStd.Def.rhs def in
(match is_single_free_reg exp with
| Some (bil_var, dw_var) when dw_var = Regs.X86_64.rsp ->
let interpreted = interpret_var_expr bil_var cur_offset exp in
(match interpreted with
| None -> lose_track
| Some new_offset ->
Some (RspOffset(new_offset))
)
| _ -> lose_track
)
| RspOffset(cur_offset), Some reg when reg = Regs.X86_64.rbp ->
(* We have CFA=rsp+k and a line %rbp <- [expr].
Might be a %rbp <- %rsp *)
let exp = BStd.Def.rhs def in
(match is_single_free_reg exp with
| Some (bil_var, dw_var) when dw_var = Regs.X86_64.rsp ->
(* We have %rbp := F(%rsp) *)
(* FIXME we wish to have %rbp := %rsp. An ugly and non-robust test to
check that would be interpret F(0), expecting that F is at worst
affine - then a restult of 0 means that %rbp := %rsp + 0 *)
let interpreted = interpret_var_expr bil_var (Int64.zero) exp in
(match interpreted with
| Some offset when offset = Int64.zero ->
Some (RbpOffset(cur_offset))
| _ ->
(* Previous instruction was rsp-indexed, here we put something
weird in %rbp, let's keep indexing with rsp and do nothing *)
None
)
| _ -> None
)
| RbpOffset(cur_offset), Some reg when reg = Regs.X86_64.rbp ->
(* Assume we are overwriting %rbp with something — we must revert to
some rsp-based indexing *)
(* FIXME don't assume the rsp offset will always be 8, find a smart way
to figure this out.
We actually use offset 16 because the `pop` will occur after the
value is read from the stack.
*)
Some (RspOffset(Int64.of_int 16))
| _ -> None
) in
let is_rbp_save_expr expr local_state =
let free_vars = BStd.Exp.free_vars expr in
let card = BStd.Var.Set.length free_vars in
let has_mem_var = BStd.Var.Set.exists
~f:(fun x -> BStd.Var.name x = "mem")
free_vars in
let free_x86_regs = Regs.X86_64.map_varset free_vars in
let has_rsp_var = free_x86_regs
|> Regs.DwRegOptSet.exists
(fun x -> match x with
| Some x when x = Regs.X86_64.rsp -> true
| _ -> false) in
let has_rbp_var = free_x86_regs
|> Regs.DwRegOptSet.exists
(fun x -> match x with
| Some x when x = Regs.X86_64.rbp -> true
| _ -> false) in
let has_intermed_rbp_var = free_vars
|> BStd.Var.Set.inter local_state.rbp_vars
|> BStd.Var.Set.is_empty
|> not in
(card = 3 && has_mem_var && has_rsp_var &&
(has_rbp_var || has_intermed_rbp_var))
in
let is_rbp_expr expr =
let free_vars = BStd.Exp.free_vars expr in
let free_x86_regs = Regs.X86_64.map_varset free_vars in
(match Regs.DwRegOptSet.cardinal free_x86_regs with
| 1 ->
let reg = Regs.DwRegOptSet.choose free_x86_regs in
(match reg with
| Some dwreg when dwreg = Regs.X86_64.rbp -> true
| _ -> false)
| _ -> false)
in
let gather_rbp_intermed_var def cur_state =
(* If `def` is `some intermed. var <- rbp`, add this information in the
local state *)
(match is_rbp_expr @@ BStd.Def.rhs def with
| true ->
let lhs_var = BStd.Def.lhs def in
if (BStd.Var.is_virtual lhs_var
&& BStd.Var.typ lhs_var = BStd.reg64_t) then
(
(* This `def` is actually of the type we want to store. *)
let n_rbp_vars = BStd.Var.Set.add cur_state.rbp_vars lhs_var in
{ cur_state with rbp_vars = n_rbp_vars }
)
else
cur_state
| false -> cur_state
)
in
let out_rbp, new_state =
(match cur_rbp with
| RbpUndef ->
let cur_state = gather_rbp_intermed_var def local_state in
(* We assume that an expression is saving %rbp on the stack at the
address %rip when the current def is an expression of the kind
`MEM <- F(MEM, %rip, v)` where `v` is either `%rbp` or some
intermediary variable holding `%rbp`.
This approach is sound when %rbp is saved using a `push`, but
probably wrong when saved using a `mov` on some stack-space allocated
previously (eg. for multiple registers saved at once).
It would be far better to actually read the position at which `v` is
saved, but this requires parsing the actual rhs expression, which is
not easily done: FIXME
*)
let new_rbp =
if (BStd.Var.name @@ BStd.Def.lhs def = "mem"
&& is_rbp_save_expr (BStd.Def.rhs def) cur_state)
then
(match cur_cfa with
| RspOffset off ->
Some (RbpCfaOffset (Int64.mul Int64.minus_one off))
| _ -> raise UnexpectedRbpSet
)
else
None
in
new_rbp, cur_state
| RbpCfaOffset offs ->
(* We go back to RbpUndef if the current def is in the rbp_pop_set --
see `find_rbp_pop_set` *)
(match BStd.Tid.Set.mem (local_state.rbp_pop_set) @@ BStd.Term.tid def
with
| true ->
Some RbpUndef, local_state
| false -> None, local_state
)
)
in
(match out_cfa, out_rbp with
| None, None -> None
| Some cfa, None -> Some (cfa, cur_rbp)
| None, Some rbp -> Some (cur_cfa, rbp)
| Some cfa, Some rbp -> Some (cfa, rbp)),
new_state
let process_jmp jmp (cur_reg: reg_pos)
: (reg_pos option) =
let cur_cfa, cur_rbp = cur_reg in
let gen_change = match cur_cfa with
| RspOffset cur_offset -> (fun off ->
let new_offset = Int64.add cur_offset (Int64.of_int off) in
Some (RspOffset(new_offset), cur_rbp)
)
| _ -> (fun _ -> None)
in
match (BStd.Jmp.kind jmp) with
| BStd.Call call -> gen_change (-8)
| BStd.Ret ret -> gen_change (8)
| _ -> None
let process_blk
next_instr_graph rbp_pop_set (block_init: reg_pos) blk
: (reg_changes_fde * reg_pos) =
(** Extracts the registers (CFA+RBP) changes of a block. *)
let apply_offset cur_addr_opt ((accu:reg_changes_fde), cur_reg, local_state)
= function
| None -> (accu, cur_reg, local_state)
| Some reg_pos ->
let cur_addr = (match cur_addr_opt with
| None -> assert false
| Some x -> to_int64_addr x) in
(AddrSet.fold (fun n_addr cur_accu ->
AddrMap.add n_addr reg_pos cur_accu)
(AddrMap.find cur_addr next_instr_graph)
accu),
reg_pos,
local_state
in
let fold_elt (accu, cur_reg, cur_local_state) elt = match elt with
| `Def(def) ->
let new_offset, new_state = process_def cur_local_state def cur_reg in
apply_offset
(opt_addr_of def) (accu, cur_reg, new_state) new_offset
| `Jmp(jmp) ->
apply_offset
(opt_addr_of jmp) (accu, cur_reg, cur_local_state)
@@ process_jmp jmp cur_reg
| _ -> (accu, cur_reg, cur_local_state)
in
let init_changes = (match opt_addr_of blk with
| None -> AddrMap.empty
| Some x ->
let blk_address = to_int64_addr x in
AddrMap.singleton blk_address block_init
) in
let empty_local_state = {
rbp_vars = BStd.Var.Set.empty;
rbp_pop_set = rbp_pop_set
} in
let elts_seq = BStd.Blk.elts blk in
let out_reg, end_reg, _ = BStd.Seq.fold elts_seq
~init:(init_changes, block_init, empty_local_state)
~f:fold_elt in
out_reg, end_reg
exception Inconsistent of BStd.tid
let get_entry_blk graph =
let entry = BStd.Seq.min_elt (CFG.nodes graph) ~cmp:(fun x y ->
let ax = opt_addr_of @@ CFG.Node.label x
and ay = opt_addr_of @@ CFG.Node.label y in
match ax, ay with
| None, None -> compare x y
| Some _, None -> -1
| None, Some _ -> 1
| Some ax, Some ay -> compare (to_int64_addr ax) (to_int64_addr ay))
in
match entry with
| None -> assert false
| Some x -> x
let find_last_addr sub =
(** Finds the maximal instruction address in a subroutine *)
let map_opt fl fr merge l r = match l, r with
| None, None -> None
| Some x, None -> Some (fl x)
| None, Some y -> Some (fr y)
| Some x, Some y -> Some (merge (fl x) (fr y))
in
let max_opt_addr_word = map_opt
(fun x -> x)
(fun y -> to_int64_addr y)
max
in
let max_opt_addr = map_opt
(fun x -> x)
(fun y -> y)
max
in
let max_def cur_max def =
max_opt_addr_word cur_max (opt_addr_of def)
in
let fold_res =
BStd.Seq.fold (BStd.Term.enum BStd.blk_t sub)
~init:None
~f:(fun cur_max blk ->
max_opt_addr
(BStd.Seq.fold (BStd.Term.enum BStd.def_t blk)
~init:cur_max
~f:max_def)
(BStd.Seq.fold (BStd.Term.enum BStd.jmp_t blk)
~init:cur_max
~f:max_def)
)
in
match fold_res with
| None -> Int64.zero
| Some x -> x
let cleanup_fde (fde_changes: reg_changes_fde) : reg_changes_fde =
(** Cleanup the result of `of_sub`.
Merges entries at the same address, propagates track lost *)
let fold_one addr reg_change (accu, last_change, lost_track) =
match reg_change, last_change, lost_track with
| _, _, true -> (accu, None, lost_track)
| (CfaLostTrack, _), _, false ->
(AddrMap.add addr reg_change accu, None, true)
| reg_change, Some prev_change, false when reg_change = prev_change ->
(accu, last_change, false)
| reg_change, _, false ->
(AddrMap.add addr reg_change accu, Some reg_change, false)
in
match AddrMap.fold fold_one fde_changes (AddrMap.empty, None, false) with
| out, _, _ -> out
let process_sub sub next_instr_graph : subroutine_cfa_data =
(** Extracts the `cfa_changes_fde` of a subroutine *)
let cfg = BStd.Sub.to_cfg sub in
let first_addr = int64_addr_of sub in
let last_addr = find_last_addr sub in
let initial_cfa_rsp_offset = Int64.of_int 8 in
let entry_blk = get_entry_blk cfg in
let rbp_pop_set = find_rbp_pop_set cfg entry_blk in
let rec dfs_process
(sub_changes: (reg_changes_fde * reg_pos) TIdMap.t)
node
(entry_offset: reg_pos) =
(** Processes one block *)
let cur_blk = CFG.Node.label node in
let tid = BStd.Term.tid @@ cur_blk in
match (TIdMap.find_opt tid sub_changes) with
| None ->
(* Not yet visited: compute the changes *)
let cur_blk_changes, end_reg =
process_blk next_instr_graph rbp_pop_set entry_offset cur_blk in
let n_sub_changes =
TIdMap.add tid (cur_blk_changes, entry_offset) sub_changes in
BStd.Seq.fold (CFG.Node.succs node cfg)
~f:(fun accu child -> dfs_process accu child end_reg)
~init:n_sub_changes
| Some (_, former_entry_offset) ->
(* Already visited: check that entry values are matching *)
if entry_offset <> former_entry_offset then
( Format.eprintf "Found inconsistency (0x%Lx): %a -- %a@."
(int64_addr_of cur_blk)
pp_reg_pos entry_offset pp_reg_pos former_entry_offset
;
raise (Inconsistent tid) )
else
sub_changes
in
let initial_offset = (RspOffset initial_cfa_rsp_offset, RbpUndef) in
let changes_map = dfs_process TIdMap.empty entry_blk initial_offset in
let merged_changes = TIdMap.fold
(fun _ (cfa_changes, _) accu -> AddrMap.union (fun _ v1 v2 ->
if v1 = v2 then
Some v1
else
assert false)
cfa_changes accu)
changes_map
AddrMap.empty in
let reg_changes = cleanup_fde merged_changes in
let output = {
reg_changes_fde = reg_changes ;
beg_pos = first_addr ;
end_pos = last_addr ;
} in
output
let of_prog prog next_instr_graph : subroutine_cfa_map =
(** Extracts the `cfa_changes` of a program *)
let fold_step accu sub =
(try
let subroutine_data = process_sub sub next_instr_graph in
StrMap.add (BStd.Sub.name sub) subroutine_data accu
with
| InvalidSub -> accu
| Inconsistent tid ->
Format.eprintf "Inconsistent TId %a in subroutine %s, skipping.@."
BStd.Tid.pp tid (BStd.Sub.name sub);
accu
)
in
let subroutines = BStd.Term.enum BStd.sub_t prog in
BStd.Seq.fold subroutines
~init:StrMap.empty
~f:fold_step
let of_proj no_rbp_undef proj : subroutine_cfa_map =
(** Extracts the `cfa_changes` of a project *)
__settings.no_rbp_undef <- no_rbp_undef ;
let next_instr_graph = build_next_instr (BStd.Project.disasm proj) in
let prog = BStd.Project.program proj in
of_prog prog next_instr_graph
let clean_lost_track_subs pre_dwarf : subroutine_cfa_map =
(** Removes the subroutines on which we lost track from [pre_dwarf] *)
let sub_lost_track sub_name (sub: subroutine_cfa_data) =
not @@ AddrMap.exists (fun addr pos ->
let cfa_pos, _ = pos in
(match cfa_pos with
| RspOffset _ | RbpOffset _ -> false
| CfaLostTrack -> true))
sub.reg_changes_fde
in
StrMap.filter sub_lost_track pre_dwarf