apparmor/utils/binary_analyze
Steve Beattie 6d3e74907d Import the rest of the core functionality of the internal apparmor
development tree (trunk branch). From svn repo version 6381.
2006-04-11 21:52:54 +00:00

1274 lines
34 KiB
Perl
Executable file

#!/usr/bin/perl
#****************************************************************************
#* Copyright (c) Andrew Gross 2003-2004
#* All Rights Reserved
#*
#* The following information and material is confidential and proprietary
#* information of Andrew Gross (the "Confidential Material") and
#* is protected by copyright, patent, trade secrets and other intellectual
#* and property laws. Access to this Confidential Material is limited to
#* authorized employees and/or licensees. Any unauthorized use of the
#* Confidential Material could subject the user to criminal and/or civil
#* penalties.
#*
#* This work is intended for Immunix, Inc. (the "Client") on a
#* work-for-hire basis pending contract completion and payment whereupon
#* all rights confer to the Client.
#*
#****************************************************************************
# NB: This script works only on dynamically linked programs. Static
# programs require a similar but different approach.
use Data::Dumper;
use strict;
# File-scope state. The script is one flat program, so every section
# below reads and writes these variables directly.
my ( $DEBUG, # debug printing control
$addr, $byte, $data, $flag, $i, $t2, $t3, # counters and temps
$first, $first_addr, $k, $last_cv, $last_push,
$subno, $drop_push_ebp, $reloc_flag, $name,
$cf, $rod, $rw, $sstr, $str, $tmp, $x, $y,
%DATA, %FNS, %RELOC, %RODATA, %SEC, %PLT, # data store hashes
%Branches,
%obj_list, %obj_ref, @o_list, # for object file info
@line, $cv, # text seg. loop info
%Regs, %Seta, @pargs, @p2args,
$NO_MATCH_DELIVERED_DEMOS, # turn off bug fixes to match delivered data
);
# Comments:
#
# subroutine args: 0x8(%ebp) [argc], 0xc(%ebp) [argv]
#
# add'l instruction support / invalidate registers when necessary
#
# ACL generation and mapping:
# bit of library stuff as example
# +open +sprintf +chdir +creat +mkdir +opendir +rename
#
# rarely a routine will save extra registers at the beginning
# and this hoses the first function call if push based args.
#
# Debug printing setting. 0 is default, 5 is highest (most verbose).
# Most tracing is enabled at >3; the table dumps are enabled at >4.
#
#$DEBUG=5;
$DEBUG=0;
# Flag selecting between the original behaviour and later fixes.
# When set to 1 the code paths marked as fixes/improvements at the use
# sites are taken (symbolic sub names, PLT lookup, register reset at
# subroutine start); when set to 0 the older code paths are used so that
# runs match the demos delivered to the Client.
# NOTE(review): the "_start" skip in the dyn_syms reader is only active
# when this flag is 0 -- confirm that is intended.
#
#$NO_MATCH_DELIVERED_DEMOS=0;
$NO_MATCH_DELIVERED_DEMOS=1;
#=========================================================================
#
# All supporting data is read in, parsed, and stored in this portion of the code.
#
# NB: In reading data all addresses are converted from hex and stored in decimal.
#=== Get sections and address ranges
# Read output from "objdump -x":
# section start and end addresses are needed later so we can
# decide how to translate addresses.
# %SEC maps section name => [ start, end, length ] with end = start + length.
$flag=0;    # becomes 1 once the "Sections:" table has been reached
print "reading header...\n";
{
    my $path = "$ARGV[0]/header";
    open(my $fh, '<', $path) || die "$path: $!";
    while (<$fh>) {
        # everything before the "Sections:" line is header info we skip
        if ( /^Sections:$/ ) {
            $flag=1;
            next;
        }
        next if !$flag;
        # ignore the junk lines (the per-section flag lines and column header)
        next if /ALLOC|CONTENTS|LOAD|READONLY|CODE|DATA/;
        next if /^Idx/;
        last if /^SYMBOL TABLE:/;
        # extract section start address, end address, and length:
        #> 0 .interp 00000013 080480f4 080480f4 000000f4 2**0
        # The fields are captured explicitly: a bare "split;" no longer
        # fills @_ (implicit split to @_ was removed in Perl 5.12).
        my @field = split;
        next if @field < 4;     # blank or truncated line, nothing to record
        my $len   = hex("0x".$field[2]);
        my $start = hex("0x".$field[3]);
        $SEC{$field[1]} = [ $start, $start+$len, $len ];   # name = start, end, len
    }
    close($fh);
}
# Print the loaded section information sorted by section name
#
if ($DEBUG>4) {
    print "\n";
    foreach my $sec (sort keys %SEC) {
        printf "%-20s\t%08x\t%08x\t%08x\n", $sec, @{ $SEC{$sec} };
    }
}
# Print the loaded section information sorted by start address (for reference)
#
if ($DEBUG>4) {
    print "\n";
    foreach my $sec (sort { ${ $SEC{$a} }[0] <=> ${ $SEC{$b} }[0] } keys %SEC) {
        printf "%-20s\t%08x\t%08x\t%08x\n", $sec, @{ $SEC{$sec} };
    }
}
#=== Get symbol names and addresses (dynamic symbol table)
# Read output from "objdump -T":
# dynamic symbol information for xlating calls.
# %FNS maps address (decimal) => symbol name.
#
print "reading dyn_syms...\n";
{
    my $path = "$ARGV[0]/dyn_syms";
    open(my $fh, '<', $path) || die "$path: $!";
    while (<$fh>) {
        # ignore junk lines
        next if /^$/;
        next if /file format elf32-i386/;
        next if /^DYNAMIC SYMBOL TABLE:/;
        # extract the address and dynamic symbol name (library functions typically)
        #> 08049260 DF *UND* 0000003a GLIBC_2.0 mkdir
        # The fields are captured explicitly: a bare "split;" no longer
        # fills @_ (implicit split to @_ was removed in Perl 5.12).
        chomp;
        my @field = split;
        next if !@field;        # whitespace-only line
        $field[0] =~ s/^0+//;
        next if $field[-1] eq "_start" && !$NO_MATCH_DELIVERED_DEMOS; # This bug fix throws off sub names
        $FNS{hex("0x".$field[0])} = $field[-1];
    }
    close($fh);
}
# Print the loaded symbol information sorted by address
#
if ($DEBUG>4) {
    print "\n";
    foreach my $fn_addr (sort { $a <=> $b } keys %FNS) {
        printf "%08x\t%-20s\n", $fn_addr, $FNS{$fn_addr};
    }
}
#=== Get symbol names and addresses (linker RR info)
# Read output from "objdump -R":
# dynamic relocation information also for xlating calls.
# Entries are merged into %FNS (address => symbol name).
#
print "reading dynam...\n";
{
    my $path = "$ARGV[0]/dynam";
    open(my $fh, '<', $path) || die "$path: $!";
    while (<$fh>) {
        # ignore junk lines
        next if /^$/;
        next if /file format elf32-i386/;
        next if /^DYNAMIC SYMBOL TABLE:/;
        next if /^OFFSET/;
        # extract the address and symbol name
        #> 0804f260 R_386_JUMP_SLOT mkdir
        # The fields are captured explicitly: a bare "split;" no longer
        # fills @_ (implicit split to @_ was removed in Perl 5.12).
        chomp;
        my @field = split;
        next if !@field;        # whitespace-only line
        $field[0] =~ s/^0+//;
        $FNS{hex("0x".$field[0])} = $field[-1];
    }
    close($fh);
}
# Print the symbol table again, now including the relocation entries,
# sorted by address
#
if ($DEBUG>4) {
    print "\n";
    foreach my $fn_addr (sort { $a <=> $b } keys %FNS) {
        printf "%08x\t%-20s\n", $fn_addr, $FNS{$fn_addr};
    }
}
#=== Get read only data (rodata)
# Read output from "objdump -s --section=.rodata":
# read only text data (string and other constants).
# %RODATA maps address (decimal) => byte value (0..255).
#
print "reading rodata...\n";
{
    my $path = "$ARGV[0]/rodata";
    open(my $fh, '<', $path) || die "$path: $!";
    while (<$fh>) {
        # ignore junk lines
        next if /^$/;
        next if /file format elf32-i386/;
        next if /^Contents of section/;
        # extract bytes and store with corresponding address key
        # in the %RODATA hash
        #> 804d9e0 03000000 01000200 756e6162 6c652074 ........unable t
        # Only the address and the (at most four) hex words that follow
        # it are taken; the trailing ASCII rendering must not be parsed
        # as data.
        next if !/^\s*([0-9a-fA-F]+)\s((?:[0-9a-fA-F]+\s){1,4})/;
        $addr = hex("0x".$1);
        $data = $2;
        $data =~ s/\s//g;
        # walk the hex string two digits (one byte) at a time
        while ( length($data) ) {
            $byte = hex("0x".substr($data,0,2));
            $RODATA{$addr} = $byte;
            $data = substr($data,2);
            $addr++;
        }
    }
    close($fh);
}
# Print the loaded string data in address order
#
if ($DEBUG>4) {
    print "\n";
    foreach my $ro_addr (sort { $a <=> $b } keys %RODATA) {
        printf "%08x\t%02x\n", $ro_addr, $RODATA{$ro_addr};
    }
}
# NOTE(review): this dump is unconditional and looks like leftover debug
# output -- confirm whether it should be guarded by $DEBUG.
print Data::Dumper->Dump([\%RODATA]);
#=== Get data (data)
# Read output from "objdump -s --section=.data":
# data section (for variable tracking and the odd string).
# %DATA maps address (decimal) => byte value (0..255).
#
print "reading data...\n";
{
    my $path = "$ARGV[0]/data";
    open(my $fh, '<', $path) || die "$path: $!";
    while (<$fh>) {
        # ignore junk lines
        next if /^$/;
        next if /file format elf32-i386/;
        next if /^Contents of section/;
        # same line layout as the rodata dump but stores in the %DATA hash
        #> 804f000 00000000 00000000 4cf20408 19000000 ........L.......
        # The address and the (at most four) hex words are matched by
        # pattern rather than by fixed columns so that addresses of any
        # width are handled; the trailing ASCII rendering is ignored.
        next if !/^\s*([0-9a-fA-F]+)\s((?:[0-9a-fA-F]+\s){1,4})/;
        $addr = hex("0x".$1);
        $data = $2;
        $data =~ s/\s//g;
        # walk the hex string two digits (one byte) at a time
        while ( length($data) ) {
            $byte = hex("0x".substr($data,0,2));
            $DATA{$addr} = $byte;
            $data = substr($data,2);
            $addr++;
        }
    }
    close($fh);
}
# Print the loaded data in address order
#
if ($DEBUG>4) {
    print "\n";
    foreach my $d_addr (sort { $a <=> $b } keys %DATA) {
        printf "%08x\t%02x\n", $d_addr, $DATA{$d_addr};
    }
}
#=== Get reloc symbol names and addresses
# Read output from "objdump -r":
# read in relocation information which contains
# hints as to in which section a datum is stored
#
# Note: this is only for handling object files
# as executables will not have data in this
# section of the file.
#
print "reading reloc...\n";
{
    my $path = "$ARGV[0]/reloc";
    open(my $fh, '<', $path) || die "$path: $!";
    while (<$fh>) {
        # ignore junk lines
        next if /^$/;
        next if /file format elf32-i386/;
        next if /^RELOCATION RECORDS/;
        next if /^OFFSET/;
        # The fields are captured explicitly: a bare "split;" no longer
        # fills @_ (implicit split to @_ was removed in Perl 5.12).
        chomp;
        my @field = split;
        next if !@field;        # whitespace-only line
        # skip BSS symbols
        next if $field[2] eq ".bss";
        #
        #> 00000098 R_386_32 .rodata
        # R_386_PC32 entries are stored with the other symbols in %FNS;
        # every other relocation type is stored in %RELOC.
        #
        if ( $field[1] eq "R_386_PC32" ) {
            $FNS{hex("0x".$field[0])} = $field[-1];
        } else {
            $RELOC{hex("0x".$field[0])} = $field[-1];
        }
    }
    close($fh);
}
# Print the loaded relocation symbols in address order
#
if ($DEBUG>4) {
    print "\n";
    foreach my $r_addr (sort {$a <=> $b} keys %RELOC) {
        printf "%08x\t%-20s\n", $r_addr, $RELOC{$r_addr};
    }
}
# Note: data from the "objdump -t" is not currently used. This information
# would only exist in an unstripped binary and is only of help
# to a human reading the code. It only contains symbols internal
# to the program and not of interest for library calls.
# We can crib this from the text directly.
#=========================================================================
#
# This portion of the code reads in the disassembly of the text segment.
# The processing is done in two phases which correspond to the
# two loops (the first split into two pieces) each of which makes
# a complete pass of the text segment. The text segment data is
# not kept in memory because it may be very large.
#
# The first pass pulls the PLT data as the newer compiler uses a
# different relocation scheme; then the subroutines are found and named,
# main() is located, and branch target addresses are found.
#
# The second pass does all of the bookkeeping required to generate
# coherent subroutine call representations.
# First pass of text segment, part 1:
# Locate .plt section and pull jump information
# which is required to xlate external library
# calls under the newer compiler.
$flag=0;    # 1 while inside the <.plt> section
print "reading text...\n";
# The handle F is deliberately left open at the end of this loop: part 2
# of the first pass continues reading from the same handle.
open(F, '<', "$ARGV[0]/text") || die "$ARGV[0]/text: $!";
while ($i=<F>) {
    # Skip lines until we find "<.plt>:", then
    # reset state (exit loop in this case) when
    # we finish that section (blank line).
    # Except for the last and section names
    # this is the same as for the other loops.
    #
    if ( $i =~ /^$/ || $i =~ /^Disassembly/ ) {
        last if $flag;
        $flag=0;
        next;
    } elsif ( $i =~ /<\.plt>:/ ) {
        $flag=1;
        next;
    } elsif ( !$flag ) {
        next;
    }
    # We are looking for lines like this:
    #> 804a58c: ff 25 74 51 0e 08 jmp *0x80e5174
    #
    # Library calls will be to this address and the destination
    # address will have to be dereferenced via the %FNS hash.
    # Remove the unneeded trailer, split, and drop the empty 1st element.
    #
    chomp($i);
    $i =~ s/ <[\w+]+>$//;
    @line=split(/\s+/, $i);
    shift @line;
    # $cv == current value == address of current line
    #
    $line[0] =~ s/:$//;
    $cv = hex("0x".$line[0]);
    # if we have a jmp *addr line, enter the info into %PLT
    # (PLT stub address => address the stub jumps through)
    #
    if ( $line[-2] eq "jmp" && $line[-1] =~ /^\*/ ) {
        $line[-1] =~ s/^\*//;
        $PLT{$cv} = hex($line[-1]);
    }
}
# If no <.plt> section was found the loop above has consumed the whole
# file (this is the case for object files). Rewind so that part 2 of the
# first pass still gets to see the disassembly.
seek(F, 0, 0) if !$flag;
# First pass of text segment, part 2:
# In which subroutines are found and named,
# and branch targets are identified.
$first_addr=0; # first address of non-system code
$first=1; # flag to make sure first subroutine is named
$last_cv=-1; # address of the previous line (for object files)
$subno="00"; # subroutine name counter
$flag=0;
# Relocation addresses in ascending order. %RELOC is not modified inside
# the loop, so the list is built once instead of once per line.
my @reloc_addrs = sort {$a <=> $b} keys %RELOC;
# NB: this loop continues on the handle F opened by part 1.
while ($i=<F>) {
    # Skip lines until we reach the beginning of the
    # actual "text" (either .text or an internal
    # subroutine name).
    #
    if ( $i =~ /^$/ || $i =~ /^Disassembly/ ) {
        $flag=0;
        next;
    } elsif ( $i =~ /<.text>:/ || $i =~ /<(\w+)>:/ ) {
        # $1 is only set when the second pattern matched, i.e. when the
        # label is a symbol name rather than the section name.
        $name = $1;
        $flag=1;
        next;
    } elsif ( !$flag ) {
        next;
    }
    print ">>> ",$i,"\n" if $DEBUG>3;
    # Clean up line as above and set $cv to current address
    #
    chomp($i);
    $i =~ s/ <[\w+]+>$//;
    @line=split(/\s+/, $i);
    shift @line;
    $line[0] =~ s/:$//;
    $cv = hex("0x".$line[0]);
    # set $first_addr only once
    #
    $first_addr=$cv if !$first_addr;
    # For object files we need to know if there is a relocation
    # entry that refers to the previous line of assembly. If so,
    # note that for later xlation in loop #2.
    #
    foreach my $r_addr (@reloc_addrs) {
        next if $r_addr < $last_cv;
        last if $r_addr > $cv;
        $obj_list{$last_cv}++;
        $obj_ref {$last_cv}=$RELOC{$r_addr};
    }
    # Main logic of pass 1. Find start of subroutines, identify
    # address of main(), and branch target addresses.
    #
    if ( $first ) {
        # Special case for first line of code so that it is
        # always flagged as a subroutine. Clear first flag
        # and enter address with subroutine name.
        #
        $first=0;
        printf "START %08x\n", $cv if $DEBUG>4;
        if ( !length($FNS{$cv}) ) {
            $FNS{$cv} = "sub_".$subno;
            $subno++;
        }
    } elsif ( $line[-2] eq "push" && $line[-1] eq "%ebp" ) {
        printf "STOP %08x\n", $cv if $DEBUG>3;
        printf "START %08x\n", $cv if $DEBUG>3;
        # This is a subroutine start [cf ref. #1] so name it
        # if it doesn't already have a name.
        #
        if ( !length($FNS{$cv}) ) {
            if ( length($name) && $NO_MATCH_DELIVERED_DEMOS ) { # This throws off sub names
                $FNS{$cv} = $name;
            } else {
                $FNS{$cv} = "sub_".$subno;
                $subno++;
            }
        }
    } elsif ( $line[-2] eq "push" ) {
        # Save last value pushed onto the stack as for
        # the __libc_start_main call that value will be
        # the address of main() [cf ref. #2]
        # NOTE(review): a register operand is also run through hex()
        # here and does not yield a meaningful address.
        #
        $last_push = $line[-1];
        $last_push =~ s/^\$//;
        $last_push = hex($last_push);
    } elsif ( $line[-2] eq "call" ) {
        $addr = $line[-1];
        if ( $addr !~ /%/ ) {
            $addr = hex($addr);
            # Check to see if the destination address of the call is in
            # the text segment. If so, make sure it's not a fake
            # localization call. [cf ref. #3]
            #
            if ( $addr >= ${ $SEC{".text"} }[0] && $addr < ${ $SEC{".text"} }[1] ) {
                if ( $line[1] eq "e8" && $line[2] eq "00" && $line[3] eq "00"
                     && $line[4] eq "00" && $line[5] eq "00" ) {
                    $FNS{$addr} = "fake_localization_call";
                    print $i,"\n" if $DEBUG>3;
                    printf "CALL= %08x\n", $addr if $DEBUG>3;
                } else {
                    printf "CALL %08x\n", $addr if $DEBUG>3;
                    # Should probably keep a list of calls into the
                    # text segment as a double check for problems
                    # with the disassembly.
                }
            } else {
                printf "CALL* %08x\n", $addr if $DEBUG>3;
            }
            # Starting program setup call. Last pushed address
            # is main(). [cf ref. #2]
            #
            if ( $FNS{$addr} eq "__libc_start_main" ||
                 $NO_MATCH_DELIVERED_DEMOS && $FNS{$PLT{$addr}} eq "__libc_start_main" ) { # Fixes PLT lookup issue
                $FNS{$last_push} = "main";
                printf "main = %08x\n", $last_push if $DEBUG>3;
            }
        } else {
            # If the call target is *%e__, then it's a register indirect call
            # and we won't have information about the destination in most cases.
            #
            print "CALL- ", $addr, "\n" if $DEBUG>3;
        }
    } elsif ( $line[-2] =~ /^j/ ) {
        # All branches start with "j" so this is a change of control and
        # we note all destinations so that we can invalidate the registers
        # at that point. More intensive branch analysis and register
        # bookkeeping can avoid having to do this in all cases but
        # beware loops/backward branches which cause problems.
        #
        $line[-1] =~ s/^0x//;
        $Branches{hex("0x".$line[-1])}++;
        print ">>> $line[-1]\n" if $DEBUG>3;
    } else {
        print ">>> Unused\n" if $DEBUG>4;
    }
    # reset last address value
    #
    $last_cv=$cv;
}
close(F);
# Print separator tag into the output file to show we've completed pass 1.
#
print "====\n";
# Second pass of text segment:
# Keep track of subroutines, registers, and branches as required to
# generate coherent subroutine calls with arguments. And generate
# output.
#
# State carried from line to line:
#   @pargs   arguments collected from push instructions (first arg first)
#   @p2args  arguments collected from "mov ...,0xNN(%esp)" (index = NN/4)
#   %Regs    last known contents of each register / tracked operand
#   %Seta    address of the line that last set each register
$drop_push_ebp=0; # flag to ignore subroutine start push
$reloc_flag=0; # relocation info present flag
@o_list = sort {$a <=> $b} keys %obj_list; # sorted list of object file info
$flag=0;
open(F, '<', "$ARGV[0]/text") || die "$ARGV[0]/text: $!";
while ($i=<F>) {
    # Same intro as previous loop.
    #
    if ( $i =~ /^$/ || $i =~ /^Disassembly/ ) {
        $flag=0;
        next;
    } elsif ( $i =~ /<.text>:/ || $i =~ /<\w+>:/ ) {
        $flag=1;
        next;
    } elsif ( !$flag ) {
        next;
    }
    # Third verse, same as the first...
    #
    chomp($i);
    $i =~ s/ <[\w+]+>$//;
    @line=split(/\s+/, $i);
    shift @line;
    $line[0] =~ s/:$//;
    $cv = hex("0x".$line[0]);
    # If this is a real subroutine, we need to ignore the push %ebp
    # as it isn't involved in a subroutine call.
    #
    if ( length($FNS{$cv}) && $FNS{$cv} ne "fake_localization_call" ) {
        print "\n",$FNS{$cv},":\n";
        $drop_push_ebp=1;
        if ( $NO_MATCH_DELIVERED_DEMOS ) { # Fixes registers not invalidated over end of subroutine
            undef @pargs;
            undef @p2args;
            undef %Regs;
        }
    }
    # If this address is the target of a branch, reset registers to
    # prevent incorrect answers and note that we have done so.
    #
    if ( $Branches{$cv} ) {
        undef @pargs;
        undef @p2args;
        undef %Regs;
        print "Branch-target\n";
    }
    # Check for object file relocation entry that applies to
    # this line and set flag if so.
    #
    if ( $#o_list > -1 ) {
        if ( $cv == $o_list[0] ) {
            $reloc_flag=1;
            shift @o_list;
            print "*** <<$obj_ref{$cv}>> " if $DEBUG>5;
        } elsif ( $cv > $o_list[0] ) {
            print "ERROR: o_list mishandled $cv $o_list[0]\n";
        }
    }
    # This case statement is the heart of the matter. This handles
    # each assembly instruction and maintains state, generating
    # whatever output is apropos. Each line that is recognized
    # and processed has a '+' prepended when it is output.
    #
    # The heart of the heart is the if's that deal with call,
    # mov, and push as these instructions are the ones that
    # control function calls and argument setups. [cf refs. #4,#5]
    # For push, @pargs contains the arguments. For mov,
    # @p2args contains the arguments. There are a few cases
    # where additional registers are pushed onto the stack
    # and this can interfere with arguments to function calls
    # but a little more bookkeeping will make those rare cases
    # go away.
    #
    # There is a rare third case of the argument setup which
    # seems to be only in optimized code. In this case
    # the arguments passed in are left on the stack and
    # implicitly referred to by function calls in the
    # subroutine.
    #
    # All code that references $reloc_flag is intended for
    # dealing with object files. As this was out of scope,
    # the support is enough to help see what is going on
    # but not sufficient for reliable use -- there
    # are a number of different segments that would have
    # to be supported for a more solid implementation.
    if ( $#line == 1 ) {
        # Only the address and one further field -- nothing to do.
        # Usually the continuation of a long instruction.
        #
        print "+",$i,"\n";
    } elsif ( $line[-2] eq "push" ) {
        # Clear rodata flag and set up for relocation entry if one exists.
        #
        $rod=0;
        if ( $reloc_flag ) {
            $line[-1] =~ s/^\$//;
            $tmp=$line[-1];
            $line[-1] = "<<" . $obj_ref{$cv} . "+" . $tmp . ">>";
        }
        # If the argument of the push is an address, check to see if there is a
        # corresponding rodata string or if it is the address of a function
        # (such as for signal(3)). Otherwise just print out the line
        # with relocation info.
        #
        if ( $line[-1] =~ /^\$/ ) {
            $line[-1] =~ s/^\$//;
            my $val = hex($line[-1]);
            $str="";
            if ( $val >= ${ $SEC{".rodata"} }[0] && $val < ${ $SEC{".rodata"} }[1] && ${ $SEC{".rodata"} }[0] ) {
                $str = &get_rodata_str($val);
                $rod=1;
            } elsif ( $val >= ${ $SEC{".text"} }[0] && $val < ${ $SEC{".text"} }[1] && ${ $SEC{".text"} }[0] ) {
                $str = length($FNS{$val}) ? $FNS{$val} : "unknown_text_addr";
            } else {
                ; # Otherwise no xlation.
            }
            print "+",$i, length($str) ? "\t".$str : "" ,"\n";
        } else {
            print "+",$i, $reloc_flag ? " reloc $line[-1]\n" : "\n";
        }
        # Transfer argument to stack. If from an initial push %ebp, ignore it.
        # Otherwise put rodata strings on preferentially or just the register/value.
        #
        if ( $line[-1] eq "%ebp" && $drop_push_ebp ) {
            # Subroutine start, ignore
        } else {
            unshift(@pargs, $rod ? $str : $line[-1]);
        }
    } elsif ( $line[-2] eq "call" ) {
        # Set the called address. Note the type of call and resolve the
        # called address if possible. $addr is -1 for an indirect call.
        #
        if ( $line[-1] =~ /\*/ ) {
            $addr = -1;
            print "+",$i,"\tindirect_call\n";
        } else {
            $addr = hex($line[-1]);
            print "+",$i,"\t",$FNS{$addr},"\n" if !$NO_MATCH_DELIVERED_DEMOS; # Old code
            if ( !length($FNS{$addr}) ) {
                if ( length($FNS{$PLT{$addr}}) ) { # Check for new style lib calls
                    $addr=$PLT{$addr};
                } else {
                    print "!!! ERR: no sub name defined\n";
                }
            }
            print "+",$i,"\t",length($FNS{$addr}) ? $FNS{$addr} : "UNKNOWN","\n" if $NO_MATCH_DELIVERED_DEMOS; # PLT support
        }
        # If this is a real call (not a fake localization call), then handle the args.
        #
        if ( $i =~ /\*/ || $FNS{$addr} ne "fake_localization_call" ) {
            # Print the name of the function call or INDIR if indirect.
            #
            if ( -1 == $addr ) {
                $sstr="INDIR";
                print "INDIR";
            } else {
                $sstr=$FNS{$addr};
                print $FNS{$addr};
            }
            # Decide which calling mode is in use. Favor mov's onto
            # the stack. Print the arguments as we have them to go with
            # the function name just printed. Then construct $sstr
            # for use in loading registers with function return
            # values.
            #
            if ( $#pargs > -1 && $#p2args == -1 ) {
                {
                    # space-separate the printed list for this print only
                    local $, = ' ';
                    print "(", @pargs, ")\n";
                }
                $sstr .= "(-" . join(", ", @pargs) . ")";
            } else {
                # Rarely %eax is used as an implicit argument --
                # seems to depend on optimizer and compiler version.
                # Registers are keyed with their '%' prefix in %Regs.
                #
                if ( !length($p2args[0]) ) {
                    if ( length($Regs{"%eax"}) ) {
                        $p2args[0] = "%eax:" . $Regs{"%eax"};
                    } else {
                        $p2args[0] = "";
                    }
                }
                # Mark any empty spaces in the argument list --
                # will flag any missed/improperly processed
                # instructions.
                #
                for my $n (0 .. $#p2args) {
                    $p2args[$n] = "<<undef>>" if !length($p2args[$n]);
                }
                {
                    local $, = ' ';
                    print "(", @p2args, ")\n";
                }
                @pargs = @p2args;
                $sstr .= "(+" . join(", ", @pargs) . ")";
            }
            # At this point we are prepared to handle a function call.
            # $FNS{$addr} contains the function name and @pargs contains
            # the arguments. $sstr contains the function call and arguments
            # for use in registers. What remains is knowing what to do for
            # each function name and how to massage the arguments to
            # get what we want.
            #
            # I'm not clear on how all of these map into ACLs for subdomain
            # so I've just decoded what I could and left it at that.
            if ( $FNS{$addr} eq "chdir" ) {
                # Print a warning since this can't be tracked w/o global
                # context.
                #
                print "EMIT: WARN: chdir global $pargs[0]\n";
            } elsif ( $FNS{$addr} eq "open" ) {
                # If filename points to register, see if that register
                # contains anything.
                #
                if ( $pargs[0] =~ /^%e..$/ && length($Regs{$pargs[0]}) ) {
                    $pargs[0] = $Regs{$pargs[0]};
                }
                # Decode open mode flags if constant, $rw gets
                # read/write modes; $cf gets '+' when bit 0x200 is set.
                #
                $rw = "?"; $cf="";
                if ( $pargs[1] =~ /^0x/ ) {
                    $tmp = hex($pargs[1]);
                    if ( ($tmp & 3) == 0 ) {
                        $rw="r";
                    } elsif ( ($tmp & 3) == 1 ) {
                        $rw="w";
                    } elsif ( ($tmp & 3) == 2 ) {
                        $rw="rw";
                    }
                    $cf="+" if ($tmp & 0x200);
                }
                print "EMIT: open $pargs[0] $rw $cf\n";
            } elsif ( $FNS{$addr} eq "fopen" ) {
                # fopen flags, while text, will need massaging to
                # put into the proper ACL format.
                #
                print "EMIT: fopen $pargs[0] $pargs[1]\n";
            } elsif ( $FNS{$addr} eq "opendir" ) {
                # Read on foo
                #
                print "EMIT: opendir $pargs[0]\n";
            } elsif ( $FNS{$addr} eq "creat" ) {
                # Write on foo
                #
                print "EMIT: creat $pargs[0] w\n";
            } elsif ( $FNS{$addr} eq "mkdir" ) {
                # ??? ACLs?
                #
                print "EMIT: mkdir $pargs[0]\n";
            } elsif ( $FNS{$addr} eq "unlink" ) {
                # ??? Write?
                #
                print "EMIT: unlink $pargs[0]\n";
            } elsif ( $FNS{$addr} eq "rename" ) {
                # Delete on old, write on new?
                #
                print "EMIT: rename [" . $sstr . "]\n";
            } elsif ( $FNS{$addr} eq "sprintf" ) {
                # In this case put results into corresponding register in case
                # it is used later on.
                #
                print "EMIT: sprintf [$sstr]\n";
                # put in reg for later use
                $Regs{$pargs[0]} = "[" . $sstr . "]";
            } elsif ( $FNS{$addr} eq "snprintf" ) {
                # In this case put results into corresponding register in case
                # it is used later on.
                #
                print "EMIT: snprintf [$sstr]\n";
                # put in reg for later use
                $Regs{$pargs[0]} = "[" . $sstr . "]";
            } elsif ( $FNS{$addr} eq "getenv" ) {
                # In this case put results into corresponding register in case
                # it is used later on.
                #
                $Regs{"%eax"} = "[" . $sstr . "]";
            } elsif ( $FNS{$addr} eq "localtime" ) {
                # Example 1 of a fixed library call. Whenever localtime(3) is
                # called, an open on /etc/localtime will result.
                #
                print "EMIT: localtime \"/etc/localtime\" r\n";
            } elsif ( $FNS{$addr} eq "openlog" ) {
                # Example 2 of a fixed library call. Whenever openlog(3) is
                # called, an open on /dev/log will result. Similar things
                # happen for nameservice calls, *pwent, *grent, etc.
                #
                print "EMIT: openlog \"/dev/log\" r\n";
            } elsif ( $FNS{$addr} =~ /^exec(l|lp|le|v|vp)$/ ) {
                # we want to know about all the exec*() variants
                print "EMIT: $FNS{$addr} [$sstr]\n";
            } elsif ( $FNS{$addr} eq "system" ) {
                # system is important too
                print "EMIT: system [$sstr]\n";
            }
        }
        # Reset arguments since we just passed a call.
        #
        undef @pargs;
        undef @p2args;
    } elsif ( $line[-2] eq "mov" && ( $line[-1] eq "%esp,%ebp" || $line[-1] eq "%ebp,%esp" ) ) {
        # Begin / end of subroutine mov's
        #
        shift @pargs if !$NO_MATCH_DELIVERED_DEMOS; # Bug in previous. This is handled elsewhere
        print "+",$i,"\n";
    } elsif ( $line[-2] =~ /^mov/ && $line[-1] =~ /^(.*),(%e..)$/ ) {
        # Save source, destination register from operands.
        #
        $str="";
        $x=$1;
        $y=$2;
        # Check for $x being a local variable that we are tracking.
        # If so, use that value to set the destination register.
        # Then check for funky indirection modes and clear them out.
        # Otherwise, check for relocation information and possible
        # rodata string else use the values given.
        # We set the "set time" of the register for use elsewhere
        # for register indirection (cf #1#).
        #
        if ( $x =~ /^0xf[0-9a-f]+\(%ebp\)$/ && length($Regs{$x})) {
            $Regs{$y} = $Regs{$x};
        } elsif ( $x =~ /\(.*\),/ ) { # funky indirection modes
            $Regs{$y} = $x;
        } else {
            if ( $reloc_flag ) {
                # strip the immediate marker so the offset can be used
                $x =~ s/^\$//;
                if ( $obj_ref{$cv} eq ".rodata" ) {
                    $str = &get_rodata_str(hex($x));
                    $Regs{$y} = $str;
                    $str = "\t".$str;
                } else {
                    $tmp = "<<" . $obj_ref{$cv} . "+" . $x . ">>";
                    $Regs{$y} = $tmp;
                }
            } else {
                if ( $x =~ /^\$/ ) {
                    $x =~ s/^\$//;
                    my $val = hex($x);
                    if ( $val >= ${ $SEC{".rodata"} }[0] && $val < ${ $SEC{".rodata"} }[1] && ${ $SEC{".rodata"} }[0] ) {
                        $Regs{$y} = &get_rodata_str($val);
                    } else {
                        $Regs{$y} = $x;
                    }
                } else {
                    $Regs{$y} = $x;
                }
            }
        }
        $Seta{$y} = $cv;
        # Handle relocation information if any. If we were able to resolve
        # a string, print it. Otherwise, note reloc info.
        # Just mark the line as handled otherwise.
        #
        if ( $reloc_flag ) {
            if ( length($str) ) {
                print "+",$i,$str,"\n";
            } else {
                print "+",$i," reloc <<$obj_ref{$cv}>>\n";
            }
        } else {
            print "+",$i,"\n";
        }
        print "== $y = $Regs{$y}\n";
    } elsif ( $line[-2] =~ /^mov/ && $line[-1] =~ /\(%esp(,1)?\)/ ) {
        # Extract offset and operand from a statement like:
        # mov %eax,0x8(%esp,1)
        # $x gets the 1st operand, $y gets the offset into the argument list.
        #
        $line[-1] =~ /^(.*),(0x.*)?\(%esp(,1)?\)$/;
        $x=$1; $y=hex($2)/4;
        if ( $x =~ /^\$/ ) {
            # If the operand is a value, check for rodata string.
            # Otherwise just put the literal value in the argument list.
            #
            $x =~ s/^\$//;
            my $val = hex($x);
            if ( $val >= ${ $SEC{".rodata"} }[0] && $val < ${ $SEC{".rodata"} }[1] ) {
                $str = &get_rodata_str($val);
                print "+",$i,"\t",$str,"\n";
                print "[",$y,"]=",$str,"\n";
                $p2args[$y] = $str;
            } else {
                print "+",$i,"\n";
                print "[",$y,"]=",$x,"\n";
                $p2args[$y] = $x;
            }
        } elsif ( $x =~ /^%/ ) {
            # If a register was set to the value of another register
            # *and* that register has not been modified since, then
            # substitute the second register's contents for the first.
            # Otherwise just put the register name. #1#
            #
            print "+",$i,"\n";
            if ( length($Regs{$x}) && $Seta{$x} > $Seta{$Regs{$x}} ) {
                print "[",$y,"]=",$x," ==> ", $Regs{$x},"\n";
                $p2args[$y] = $Regs{$x};
            } else {
                print "[",$y,"]=",$x,"\n";
                $p2args[$y] = $x;
            }
        } else {
            # Wasn't a format we deal with, so mark the line as unprocessed.
            #
            print "X",$i,"\n";
        }
    } elsif ( $line[-2] eq "pushl" ) {
        # Push long (32-bit) quantity; recorded with an "L:" prefix.
        #
        if ( $reloc_flag ) {
            $line[-1] =~ s/^\$//;
            $tmp=$line[-1];
            $line[-1] = "<<" . $obj_ref{$cv} . "+" . $tmp . ">>";
            print "+",$i," reloc $line[-1]\n";
        } else {
            print "+",$i,"\n";
        }
        unshift(@pargs, "L:".$line[-1]);
    } elsif ( $line[-2] eq "add" || $line[-2] eq "sub" || $line[-2] eq "and" || $line[-2] eq "ror" ) {
        # Not implementing these opcodes. Invalidate affected register.
        # Only act when the destination really is a register; otherwise
        # $1 would still hold the capture of some earlier match.
        #
        if ( $line[-1] =~ /^.*,(%e..)$/ ) {
            $Regs{$1} = undef;
        }
        print "+",$i, $reloc_flag ? " reloc <<$obj_ref{$cv}>>\n" : "\n";
    } elsif ( $line[-2] =~ /^mov[lwb]?$/ ) {
        # We only care about (%esp,1), (%esp), and register versions of mov opcodes
        # and those are handled above.
        #
        print "+",$i, $reloc_flag ? " reloc <<$obj_ref{$cv}>>\n" : "\n";
    } elsif ( $line[-2] =~ /^j/ ) {
        # Branch/change of control. Reset regs and stack.
        #
        print "+",$i,"\n";
        undef @pargs;
        undef @p2args;
        undef %Regs;
    } elsif ( $line[-2] =~ /^test[bwl]?$/ || $line[-2] =~ /^cmp[bwl]?$/ ) {
        # Just ignore. Nothing to do with comparison opcodes.
        # Print relocation info if present.
        #
        print "+",$i, $reloc_flag ? " reloc <<$obj_ref{$cv}>>\n" : "\n";
    } elsif ( $line[-3] eq "repz" || $line[-3] eq "repnz" || $line[-2] eq "setne" ) {
        # Moving loop opcodes. Probably should invalidate related registers
        # and not depend on the compiler to reinitialize them.
        #
        print "+",$i,"\n";
    } elsif ( $line[-2] eq "pop" && $line[-1] eq "%ebp" ) {
        # Just ignore. End of subroutine cruft.
        #
        print "+",$i,"\n";
    } elsif ( $line[-2] eq "lea" && ( $line[-1] =~ /,(%e..)$/ || $line[-1] =~ /[sd]i.*[sd]i/ ) ) {
        # Just ignore these as they are filler between subroutines.
        # The destination register, when there is one, is invalidated.
        #
        $Regs{$1} = undef if length($1);
        print "+",$i,"\n";
    } elsif ( $line[-2] eq "xor" && $line[-1] =~ /^(%e..),\1$/ ) {
        # xor-ing a register with itself is a shorthand for setting it to zero.
        #
        $Regs{$1} = 0;
        print "+",$i,"\n";
    } elsif ( $line[-2] eq "inc" || $line[-2] eq "dec" || $line[-2] eq "not" || $line[-2] eq "neg" ) {
        # Not implementing these opcodes; just invalidate register contents.
        #
        $Regs{$line[-1]} = undef;
        print "+",$i,"\n";
    } elsif ( $line[-1] eq "hlt" ||
              $line[-1] eq "nop" ||
              $line[-1] eq "leave" ||
              $line[-1] eq "ret" ) {
        # These are all control flow related opcodes; all ignored.
        #
        print "+",$i,"\n";
    } elsif ( $line[-2] eq "pop" ) {
        # Currently not used to manipulate the stack.
        # Stack is dumped after each "call".
        #
        print "+",$i,"\n";
    } else {
        # This line is not recognized nor processed.
        #
        print $i,"\n";
    }
    # Reset relocation info flag for next pass
    #
    $reloc_flag=0;
}
close(F);
# All done
#
exit 0;
#=========================================================================
#
# Subroutines:
# get_rodata_str:
# Takes a decimal address and returns the best representation
# of data at that address in the %RODATA hash as a string.
# String is returned in quotes and with most metacharacters
# replaced with '?'
#
# get_rodata_str($addr)
#   $addr   decimal address to start reading at in the %RODATA hash.
# Returns the bytes from $addr up to (not including) the first zero or
# missing byte as a double-quoted string. Printable ASCII (32..126) is
# copied as is, newline and tab become the two-character sequences \n
# and \t, and every other byte becomes '?'.
#
# The sub takes one argument, so it must not carry an empty "()"
# prototype (that declares a sub taking no arguments and only worked
# because the callers use the &name(...) form, which skips the check).
sub get_rodata_str {
    my ($addr) = @_;
    my $str = "\"";
    while ( $RODATA{$addr} ) {
        my $ch = $RODATA{$addr};
        if ( $ch > 31 && $ch < 127 ) {
            $str .= chr($ch);
        } elsif ( 10 == $ch ) {
            $str .= "\\n";
        } elsif ( 9 == $ch ) {
            $str .= "\\t";
        } else {
            $str .= "?";
        }
        $addr++;
    }
    $str .= "\"";
    return $str;
}
#=========================================================================
#
# References:
#
# Ref. #1:
# User subroutines always begin with "push %ebp". If this
# changes, then a number of modifications of this script will be
# required. For example:
#
# 8049864: 55 push %ebp
#
#
#
# Ref. #2:
#
# This is the typical start() routine. Note the last item
# pushed on the stack is the address of main().
#
# 8049840: 31 ed xor %ebp,%ebp
# 8049842: 5e pop %esi
# 8049843: 89 e1 mov %esp,%ecx
# 8049845: 83 e4 f0 and $0xfffffff0,%esp
# 8049848: 50 push %eax
# 8049849: 54 push %esp
# 804984a: 52 push %edx
# 804984b: 68 50 d9 04 08 push $0x804d950 sub_75
# 8049850: 68 20 d9 04 08 push $0x804d920 sub_74
# 8049855: 51 push %ecx
# 8049856: 56 push %esi
# 8049857: 68 10 b2 04 08 push $0x804b210 main
# 804985c: e8 4f fd ff ff call 0x80495b0 __libc_start_main
# __libc_start_main( 0x804b210 %esi %ecx 0x804d920 0x804d950 %edx %esp %eax )
# 8049861: f4 hlt
# 8049862: 90 nop
# 8049863: 90 nop
#
#
#
# Ref. #3:
# A fake call used for getting the current execution address (PC).
# It is a call to relative address 0 (next instruction) and then
# the return address is popped off the stack for manipulation.
# I've only seen this in system code. For example:
#
# 8049869: e8 00 00 00 00 call 0x804986e fake_localization_call
# 804986e: 5b pop %ebx
# 804986f: 81 c3 e6 59 00 00 add $0x59e6,%ebx
# 8049875: 8b 83 84 01 00 00 mov 0x184(%ebx),%eax
#
#
#
# Ref. #4:
# Calling sequence based on mov. In this case operands are put in the stack
# by moving them to 0xXX(%esp,1) where 0xXX is 4 times the parameter index,
# e.g., 0x0 is first, 0x4 is second, etc. Variants include just (%esp).
#
# 8049bb6: c7 44 24 04 bd dd 04 movl $0x804ddbd,0x4(%esp,1) "%s"
# 8049bbd: 08
# 8049bbe: 8b 45 08 mov 0x8(%ebp),%eax
# 8049bc1: c7 04 24 03 00 00 00 movl $0x3,(%esp,1)
# 8049bc8: 89 44 24 08 mov %eax,0x8(%esp,1)
# 8049bcc: e8 bf f8 ff ff call 0x8049490 syslog
# syslog( 0x3 "%s" 0x8(%ebp) )
#
#
#
# Ref. #5:
# Calling sequence based on push. In this case operands are put on the stack in
# reverse order for the call. For example:
#
# 804b286: 68 aa 53 0c 08 push $0x80c53aa "none"
# 804b28b: a1 60 b4 21 08 mov 0x821b460,%eax
# 804b290: 50 push %eax
# 804b291: e8 d6 fb ff ff call 804ae6c strcasecmp
# strcasecmp( %eax "none" )