#!/usr/bin/perl
#****************************************************************************
#* Copyright (c) Andrew Gross 2003-2004
#* All Rights Reserved
#*
#* The following information and material is confidential and proprietary
#* information of Andrew Gross (the "Confidential Material") and
#* is protected by copyright, patent, trade secrets and other intellectual
#* and property laws. Access to this Confidential Material is limited to
#* authorized employees and/or licensees. Any unauthorized use of the
#* Confidential Material could subject the user to criminal and/or civil
#* penalties.
#*
#* This work is intended for Immunix, Inc. (the "Client") on a
#* work-for-hire basis pending contract completion and payment whereupon
#* all rights confer to the Client.
#*
#****************************************************************************

# NB: This script works only on dynamically linked programs. Static
#     programs require a similar but different approach.

use Data::Dumper;
use strict;

my (
    $DEBUG,                                     # debug printing control
    $addr, $byte, $data, $flag, $i, $t2, $t3,   # counters and temps
    $first, $first_addr, $k, $last_cv, $last_push, $subno,
    $drop_push_ebp, $reloc_flag,
    $name, $cf, $rod, $rw, $sstr, $str, $tmp, $x, $y,
    %DATA, %FNS, %RELOC, %RODATA, %SEC, %PLT,   # data store hashes
    %Branches, %obj_list, %obj_ref, @o_list,    # for object file info
    @line, $cv,                                 # text seg. loop info
    %Regs, %Seta, @pargs, @p2args,
    $NO_MATCH_DELIVERED_DEMOS,                  # turn off bug fixes to match delivered data
);

# Comments:
#
# subroutine args: 0x8(%ebp) [argc], 0xc(%ebp) [argv]
#
# add'l instruction support / invalidate registers when necessary
#
# ACL generation and mapping:
#   bit of library stuff as example
#   +open +sprintf +chdir +creat +mkdir +opendir +rename
#
# rarely a routine will save extra registers at the beginning
# and this hoses the first function call if push based args.

# Debug printing setting.  0 is default, 5 is highest (most verbose)
#
#$DEBUG=5;
$DEBUG=0;

# Flag to disable improved functionality and bug fixes so that runs will
# match demos delivered to the Client.  (1 == fixes enabled.)
#
#$NO_MATCH_DELIVERED_DEMOS=0;
$NO_MATCH_DELIVERED_DEMOS=1;

# The single command line argument is the directory holding the saved
# objdump output files (header, dyn_syms, dynam, rodata, data, reloc, text).
#
die "usage: $0 <objdump-output-dir>\n"
    unless defined $ARGV[0] && length $ARGV[0];

# open_input:
#   Opens "$ARGV[0]/<name>" read-only and returns the lexical file handle.
#   Dies with the full path and the OS error if the open fails.
#
sub open_input {
    my ($name) = @_;
    my $path = "$ARGV[0]/$name";
    open(my $fh, '<', $path) or die "cannot open $path: $!\n";
    return $fh;
}

# is_hex_field:
#   True when the argument is defined and consists only of hex digits.
#   Used to reject header/junk lines before their first column is
#   converted with hex().
#
sub is_hex_field {
    my ($field) = @_;
    return defined($field) && $field =~ /^[0-9a-fA-F]+$/;
}

# load_section_dump:
#   Reads "objdump -s" output from file <name> and stores one entry per
#   byte into the hash referenced by $store, keyed by decimal address.
#   Only the hex columns are decoded: the address, then up to four
#   single-space separated hex groups.  The trailing ASCII column is set
#   off by extra spaces and is deliberately not parsed.
#
sub load_section_dump {
    my ($name, $store) = @_;
    my $fh = open_input($name);
    while (my $row = <$fh>) {
        # ignore junk lines
        next if $row =~ /^$/;
        next if $row =~ /file format elf32-i386/;
        next if $row =~ /^Contents of section/;

        #> 804d9e0 03000000 01000200 756e6162 6c652074  ........unable t
        next unless $row =~ /^\s*([0-9a-fA-F]+)((?: [0-9a-fA-F]+){1,4})/;
        my ($at, $hex) = (hex($1), $2);
        $hex =~ s/ //g;
        foreach my $pair ($hex =~ /([0-9a-fA-F]{2})/g) {
            $store->{$at} = hex($pair);
            $at++;
        }
    }
    close($fh);
    return;
}

#=========================================================================
#
# All supporting data is read in, parsed, and stored in this portion of the code.
#
# NB: In reading data all addresses are converted from hex and stored in decimal.

#=== Get sections and address ranges
# Read output from "objdump -x":
#   section start and end addresses are needed later so we can
#   decide how to translate addresses

$flag=0;
print "reading header...\n";
{
    my $fh = open_input("header");
    while (my $row = <$fh>) {
        if ( $row =~ /^Sections:$/ ) {
            $flag=1;                # we've passed the header info
            next;
        }
        next if !$flag;
        # ignore the junk lines
        next if $row =~ /ALLOC|CONTENTS|LOAD|READONLY|CODE|DATA/;
        next if $row =~ /^Idx/;
        last if $row =~ /^SYMBOL TABLE:/;

        # extract section start address, end address, and length:
        #> 0 .interp 00000013 080480f4 080480f4 000000f4 2**0
        my @f = split ' ', $row;
        next unless @f >= 4 && is_hex_field($f[2]) && is_hex_field($f[3]);
        my $len   = hex($f[2]);
        my $start = hex($f[3]);
        $SEC{$f[1]} = [ $start, $start+$len, $len ];    # name = start, end, len
    }
    close($fh);
}

# Print the loaded section information
#
if ($DEBUG>4) {
    print "\n";
    foreach my $sec (sort keys %SEC) {
        printf "%-20s\t%08x\t%08x\t%08x\n", $sec, @{ $SEC{$sec} };
    }
}

# Print the loaded section information sorted by start address (for reference)
#
if ($DEBUG>4) {
    print "\n";
    foreach my $sec (sort { ${ $SEC{$a} }[0] <=> ${ $SEC{$b} }[0] } keys %SEC) {
        printf "%-20s\t%08x\t%08x\t%08x\n", $sec, @{ $SEC{$sec} };
    }
}

#=== Get symbol names and addresses (dynamic symbol table)
# Read output from "objdump -T":
#   dynamic symbol information for xlating calls
#
print "reading dyn_syms...\n";
{
    my $fh = open_input("dyn_syms");
    while (my $row = <$fh>) {
        # ignore junk lines
        next if $row =~ /^$/;
        next if $row =~ /file format elf32-i386/;
        next if $row =~ /^DYNAMIC SYMBOL TABLE:/;

        # extract the address and dynamic symbol name (library functions typically)
        #> 08049260 DF *UND* 0000003a GLIBC_2.0 mkdir
        chomp $row;
        my @f = split ' ', $row;
        next unless is_hex_field($f[0]);
        next if $f[-1] eq "_start" && !$NO_MATCH_DELIVERED_DEMOS; # This bug fix throws off sub names
        $FNS{hex($f[0])} = $f[-1];
    }
    close($fh);
}

# Print the loaded symbol information sorted by address
#
if ($DEBUG>4) {
    print "\n";
    foreach my $sym_addr (sort { $a <=> $b } keys %FNS) {
        printf "%08x\t%-20s\n", $sym_addr, $FNS{$sym_addr};
    }
}

#=== Get symbol names and addresses (linker RR info)
# Read output from "objdump -R":
#   dynamic relocation information also for xlating calls
#
print "reading dynam...\n";
{
    my $fh = open_input("dynam");
    while (my $row = <$fh>) {
        # ignore junk lines
        next if $row =~ /^$/;
        next if $row =~ /file format elf32-i386/;
        next if $row =~ /^DYNAMIC SYMBOL TABLE:/;
        next if $row =~ /^DYNAMIC RELOCATION RECORDS/;
        next if $row =~ /^OFFSET/;

        # extract the address and symbol name
        #> 0804f260 R_386_JUMP_SLOT mkdir
        chomp $row;
        my @f = split ' ', $row;
        next unless is_hex_field($f[0]);
        $FNS{hex($f[0])} = $f[-1];
    }
    close($fh);
}

# Print the loaded relocation information sorted by address
#
if ($DEBUG>4) {
    print "\n";
    foreach my $sym_addr (sort { $a <=> $b } keys %FNS) {
        printf "%08x\t%-20s\n", $sym_addr, $FNS{$sym_addr};
    }
}

#=== Get read only data (rodata)
# Read output from "objdump -s --section=.rodata":
#   read only text data (string and other constants)
#
print "reading rodata...\n";
load_section_dump("rodata", \%RODATA);

# Print the loaded string data
#
if ($DEBUG>4) {
    print "\n";
    foreach my $byte_addr (sort { $a <=> $b } keys %RODATA) {
        printf "%08x\t%02x\n", $byte_addr, $RODATA{$byte_addr};
    }
    # Full structure dump; previously printed unconditionally, which
    # flooded normal runs with one line per rodata byte.
    print Data::Dumper->Dump([\%RODATA]);
}

#=== Get data (data)
# Read output from "objdump -s --section=.data":
#   data section (for variable tracking and the odd string)
#
print "reading data...\n";
load_section_dump("data", \%DATA);

# Print the loaded data
#
if ($DEBUG>4) {
    print "\n";
    foreach my $byte_addr (sort { $a <=> $b } keys %DATA) {
        printf "%08x\t%02x\n", $byte_addr, $DATA{$byte_addr};
    }
}

#=== Get reloc symbol names and addresses
# Read output from "objdump -r":
#   read in relocation information which contains
#   hints as to in which section a datum is stored
#
# Note: this is only for handling object files
#       as executables will not have data in this
#       section of the file.
#
print "reading reloc...\n";
{
    my $fh = open_input("reloc");
    while (my $row = <$fh>) {
        # ignore junk lines
        next if $row =~ /^$/;
        next if $row =~ /file format elf32-i386/;
        next if $row =~ /^RELOCATION RECORDS/;
        next if $row =~ /^OFFSET/;

        chomp $row;
        my @f = split ' ', $row;
        next unless is_hex_field($f[0]);

        # skip BSS symbols
        next if defined $f[2] && $f[2] eq ".bss";

        #
        #> 00000098 R_386_32 .rodata
        # R_386_PC32 entries are stored with the other symbols in %FNS;
        # every other relocation type is stored in %RELOC.
        #
        if ( defined $f[1] && $f[1] eq "R_386_PC32" ) {
            $FNS{hex($f[0])} = $f[-1];
        } else {
            $RELOC{hex($f[0])} = $f[-1];
        }
    }
    close($fh);
}

# Print the loaded relocation symbols
#
if ($DEBUG>4) {
    print "\n";
    foreach my $rel_addr (sort {$a <=> $b} keys %RELOC) {
        printf "%08x\t%-20s\n", $rel_addr, $RELOC{$rel_addr};
    }
}

# Note: data from the "objdump -t" is not currently used. This information
#       would only exist in an unstripped binary and is only of help
#       to a human reading the code. It only contains symbols internal
#       to the program and not of interest for library calls.
#       We can crib this from the text directly.
#=========================================================================
#
# This portion of the code reads in the disassembly of the text segment.
# The processing is done in two phases which correspond to the
# two loops (the first split into two pieces) each of which makes
# a complete pass of the text segment. The text segment data is
# not stored in core due to its possibly large size.
#
# The first pass pulls the PLT data as the newer compiler uses a
# different relocation scheme; then the subroutines are found and named,
# main() is located, and branch target addresses are found.
#
# The second pass does all of the bookkeeping required to generate
# coherent subroutine call representations.

# First pass of text segment, part 1:
#   Locate .plt section and pull jump information
#   which is required to xlate external library
#   calls under the newer compiler.

$flag=0;
print "reading text...\n";
my $text_path = "$ARGV[0]/text";
open(my $text_fh, '<', $text_path) or die "cannot open $text_path: $!\n";
while ($i=<$text_fh>) {

    # Skip lines until we find "<.plt>:", then
    # reset state (exit loop in this case) when
    # we finish that section (blank line).
    # Except for the last and section names
    # this is the same as for the other loops.
    #
    if ( $i =~ /^$/ || $i =~ /^Disassembly/ ) {
        last if $flag;
        $flag=0;
        next;
    } elsif ( $i =~ /<.plt>:/ ) {
        $flag=1;
        next;
    } elsif ( !$flag ) {
        next;
    }

    # We are looking for lines like this:
    #> 804a58c: ff 25 74 51 0e 08 jmp *0x80e5174
    #
    # Library calls will be to this address and the destination
    # address will have to be dereferenced via the %FNS hash
    # remove unneeded trailer, split, and dump null 1st element
    # set $cv to current line address
    # ($cv == current value == address of current line)
    #
    ($i, $cv, @line) = split_disasm_line($i);

    # if we have a jmp *addr line, enter the info into %PLT
    #
    if ( $line[-2] eq "jmp" && $line[-1] =~ /^\*/ ) {
        $line[-1] =~ s/^\*//;
        $PLT{$cv} = hex($line[-1]);
    }
}

# If no .plt section was found (e.g. an object file) the loop above ran
# to end of file and part 2 would see nothing; rewind so it scans the
# whole disassembly instead.
#
if ( !$flag && $NO_MATCH_DELIVERED_DEMOS ) {
    seek($text_fh, 0, 0);
}

# First pass of text segment, part 2:
#   In which subroutines are found and named,
#   and branch targets are identified.

$first_addr=0;      # first address of non-system code
$first=1;           # flag to make sure first subroutine is named
$last_cv=-1;        # address of the previous line (for object files)
$subno="00";        # subroutine name counter (string increment keeps the zero padding)
$flag=0;

# %RELOC is not modified in this loop, so sort its keys once up front
# rather than once per disassembly line.
my @reloc_addrs = sort {$a <=> $b} keys %RELOC;

while ($i=<$text_fh>) {

    # Skip lines until we reach the beginning of the
    # actual "text" (either .text or an internal
    # subroutine name).
    #
    if ( $i =~ /^$/ || $i =~ /^Disassembly/ ) {
        $flag=0;
        next;
    } elsif ( $i =~ /<.text>:/ || $i =~ /<(\w+)>:/ ) {
        $name = $1;
        $flag=1;
        next;
    } elsif ( !$flag ) {
        next;
    }

    print ">>> ",$i,"\n" if $DEBUG>3;

    # Clean up line as above and set $cv to current address
    #
    ($i, $cv, @line) = split_disasm_line($i);

    # set $first_addr only once
    #
    $first_addr=$cv if !$first_addr;

    # For object files we need to know if there is a relocation
    # entry that refers to the previous line of assembly. If so,
    # note that for later xlation in loop #2.
    #
    foreach my $rel_addr (@reloc_addrs) {
        next if $rel_addr < $last_cv;
        last if $rel_addr > $cv;
        $obj_list{$last_cv}++;
        $obj_ref {$last_cv}=$RELOC{$rel_addr};
    }

    # Main logic of pass 1. Find start of subroutines, identify
    # address of main(), and branch target addresses.
    #
    if ( $first ) {

        # Special case for first line of code so that it is
        # always flagged as a subroutine. Clear first flag
        # and enter address with subroutine name.
        #
        $first=0;
        printf "START %08x\n", $cv if $DEBUG>4;
        if ( !length($FNS{$cv}) ) {
            $FNS{$cv} = "sub_".$subno;
            $subno++;
        }

    } elsif ( $line[-2] eq "push" && $line[-1] eq "%ebp" ) {

        printf "STOP %08x\n", $cv if $DEBUG>3;
        printf "START %08x\n", $cv if $DEBUG>3;

        # This is a subroutine start [cf ref. #1] so name it
        # if it doesn't already have a name.
        #
        if ( !length($FNS{$cv}) ) {
            if ( length($name) && $NO_MATCH_DELIVERED_DEMOS ) { # This throws off sub names
                $FNS{$cv} = $name;
            } else {
                $FNS{$cv} = "sub_".$subno;
                $subno++;
            }
        }

    } elsif ( $line[-2] eq "push" ) {

        # Save last value pushed onto the stack as for
        # the __libc_start_main call that value will be
        # the address of main() [cf ref. #2]
        #
        $last_push = $line[-1];
        $last_push =~ s/^\$//;
        $last_push = hex($last_push);

    } elsif ( $line[-2] eq "call" ) {

        $addr = $line[-1];
        if ( $addr !~ /%/ ) {
            $addr = hex($addr);

            # Check to see if the destination address of the call is in
            # the text segment. If so, make sure it's not a fake
            # localization call. [cf ref. #3]
            #
            if ( $addr >= ${ $SEC{".text"} }[0] && $addr < ${ $SEC{".text"} }[1] ) {
                if ( $line[1] eq "e8" && $line[2] eq "00" && $line[3] eq "00" &&
                     $line[4] eq "00" && $line[5] eq "00" ) {
                    $FNS{$addr} = "fake_localization_call";
                    print $i,"\n" if $DEBUG>3;
                    printf "CALL= %08x\n", $addr if $DEBUG>3;
                } else {
                    printf "CALL %08x\n", $addr if $DEBUG>3;
                    # Should probably keep a list of calls into the
                    # text segment as a double check for problems
                    # with the disassembly.
                }
            } else {
                printf "CALL* %08x\n", $addr if $DEBUG>3;
            }

            # Starting program setup call. Last pushed address
            # is main(). [cf ref. #2]
            #
            if ( $FNS{$addr} eq "__libc_start_main" ||
                 $NO_MATCH_DELIVERED_DEMOS && $FNS{$PLT{$addr}} eq "__libc_start_main" ) { # Fixes PLT lookup issue
                $FNS{$last_push} = "main";
                printf "main = %08x\n", $last_push if $DEBUG>3;
            }
        } else {
            # If the call target is *%e__, then it's a register indirect call
            # and we won't have information about the destination in most cases.
            #
            print "CALL- ", $addr, "\n" if $DEBUG>3;
        }

    } elsif ( $line[-2] =~ /^j/ ) {

        # All branches start with "j" so this is a change of control and
        # we note all destinations so that we can invalidate the registers
        # at that point. More intensive branch analysis and register
        # bookkeeping can avoid having to do this in all cases but
        # beware loops/backward branches which cause problems.
        #
        $line[-1] =~ s/^0x//;
        $Branches{hex("0x".$line[-1])}++;
        print ">>> $line[-1]\n" if $DEBUG>3;

    } else {
        print ">>> Unused\n" if $DEBUG>4;
    }

    # reset last address value
    #
    $last_cv=$cv;
}
close($text_fh);

# Print separator tag into the output file to show we've completed pass 1.
#
print "====\n";

# Second pass of text segment:
#   Keep track of subroutines, registers, and branches as required to
#   generate coherent subroutine calls with arguments. And generate
#   output.

$drop_push_ebp=0;   # flag to ignore subroutine start push
$reloc_flag=0;      # relocation info present flag
@o_list = sort {$a <=> $b} keys %obj_list;  # sorted list of object file info
$flag=0;
open($text_fh, '<', $text_path) or die "cannot open $text_path: $!\n";
while ($i=<$text_fh>) {

    # Same intro as previous loop.
    #
    if ( $i =~ /^$/ || $i =~ /^Disassembly/ ) {
        $flag=0;
        next;
    } elsif ( $i =~ /<.text>:/ || $i =~ /<\w+>:/ ) {
        $flag=1;
        next;
    } elsif ( !$flag ) {
        next;
    }

    # Third verse, same as the first...
    #
    ($i, $cv, @line) = split_disasm_line($i);

    # If this is a real subroutine, we need to ignore the push %ebp
    # as it isn't involved in a subroutine call.
    # NOTE(review): $drop_push_ebp is never cleared once set, so every
    # later "push %ebp" is ignored as well -- confirm this is intended.
    #
    if ( length($FNS{$cv}) && $FNS{$cv} ne "fake_localization_call" ) {
        print "\n",$FNS{$cv},":\n";
        $drop_push_ebp=1;
        if ( $NO_MATCH_DELIVERED_DEMOS ) { # Fixes registers not invalidated over end of subroutine
            undef @pargs;
            undef @p2args;
            undef %Regs;
        }
    }

    # If this address is the target of a branch, reset registers to
    # prevent incorrect answers and note that we have done so.
    #
    if ( $Branches{$cv} ) {
        undef @pargs;
        undef @p2args;
        undef %Regs;
        print "Branch-target\n";
    }

    # Check for object file relocation entry that applies to
    # this line and set flag if so.
    #
    if ( $#o_list > -1 ) {
        if ( $cv == $o_list[0] ) {
            $reloc_flag=1;
            shift @o_list;
            # was "$DEBUG>5", which can never be true (5 is the highest level)
            print "*** <<$obj_ref{$cv}>> " if $DEBUG>4;
        } elsif ( $cv > $o_list[0] ) {
            print "ERROR: o_list mishandled $cv $o_list[0]\n";
        }
    }

    # This case statement is the heart of the matter. This handles
    # each assembly instruction and maintains state, generating
    # whatever output is apropos. Each line that is recognized
    # and processed has a '+' prepended when it is output.
    #
    # The heart of the heart is the if's that deal with call,
    # mov, and push as these instructions are the ones that
    # control function calls and argument setups. [cf refs. #4,#5]
    # For push, @pargs contains the arguments. For mov,
    # @p2args contains the arguments. There are a few cases
    # where additional registers are pushed onto the stack
    # and this can interfere with arguments to function calls
    # but a little more bookkeeping will make those rare cases
    # go away.
    #
    # There is a rare third case of the argument setup which
    # seems to be only in optimized code. In this case
    # the arguments passed in are left on the stack and
    # implicitly referred to by function calls in the
    # subroutine.
    #
    # All code that references $reloc_flag is intended for
    # dealing with object files. As this was out of scope,
    # the support is enough to help see what is going on
    # but not sufficient for reliable use -- there
    # are a number of different segments that would have
    # to be supported for a more solid implementation.

    if ( $#line == 1 ) {

        # Address plus a single element -- nothing to do. Usually the
        # continuation of a long instruction.
        #
        print "+",$i,"\n";

    } elsif ( $line[-2] eq "push" ) {

        # Clear rodata flag and set up for relocation entry if one exists.
        #
        $rod=0;
        if ( $reloc_flag ) {
            $line[-1] =~ s/^\$//;
            $tmp=$line[-1];
            $line[-1] = "<<" . $obj_ref{$cv} . "+" . $tmp . ">>";
        }

        # If the argument of the push is an address, check to see if there is a
        # corresponding rodata string or if it is the address of a function
        # (such as for signal(3)). Otherwise just print out the line
        # with relocation info.
        #
        if ( $line[-1] =~ /^\$/ ) {
            $line[-1] =~ s/^\$//;
            my $imm = hex($line[-1]);
            $str="";
            if ( in_section($imm, ".rodata") ) {
                $str = get_rodata_str($imm);
                $rod=1;
            } elsif ( in_section($imm, ".text") ) {
                $str = length($FNS{$imm}) ? $FNS{$imm} : "unknown_text_addr";
            } else {
                ;   # Otherwise no xlation.
            }
            print "+",$i, length($str) ? "\t".$str : "" ,"\n";
        } else {
            print "+",$i, $reloc_flag ? " reloc $line[-1]\n" : "\n";
        }

        # Transfer argument to stack. If from an initial push %ebp, ignore it.
        # Otherwise put rodata strings on preferentially or just the register/value.
        #
        if ( $line[-1] eq "%ebp" && $drop_push_ebp ) {
            # Subroutine start, ignore
        } else {
            unshift(@pargs, $rod ? $str : $line[-1]);
        }

    } elsif ( $line[-2] eq "call" ) {

        # Set the called address. Note the type of call and resolve the
        # called address if possible.
        #
        if ( $line[-1] =~ /\*/ ) {
            $addr = -1;
            print "+",$i,"\tindirect_call\n";
        } else {
            $addr = hex($line[-1]);
            print "+",$i,"\t",$FNS{$addr},"\n" if !$NO_MATCH_DELIVERED_DEMOS; # Old code
            if ( !length($FNS{$addr}) ) {
                if ( length($FNS{$PLT{$addr}}) ) {  # Check for new style lib calls
                    $addr=$PLT{$addr};
                } else {
                    print "!!! ERR: no sub name defined\n";
                }
            }
            print "+",$i,"\t",length($FNS{$addr}) ? $FNS{$addr} : "UNKNOWN","\n"
                if $NO_MATCH_DELIVERED_DEMOS; # PLT support
        }

        # Name of the callee; empty when the address could not be resolved
        # (including indirect calls, where $addr is -1).
        my $fn = defined $FNS{$addr} ? $FNS{$addr} : "";

        # If this is a real call (not a fake localization call), then handle the args.
        #
        if ( $i =~ /\*/ || $fn ne "fake_localization_call" ) {

            # Print the name of the function call or INDIR if indirect.
            #
            if ( -1 == $addr ) {
                $sstr="INDIR";
                print "INDIR";
            } else {
                $sstr=$fn;
                print $fn;
            }

            # Decide which calling mode is in use. Favor mov's onto
            # the stack. Print the arguments as we have them to go with
            # the function name just printed. Then construct $sstr
            # for use in loading registers with function return
            # values.
            #
            if ( $#pargs > -1 && $#p2args == -1 ) {
                {
                    local $, = ' ';
                    print "(", @pargs, ")\n";
                }
                $sstr .= "(-" . join(", ", @pargs) . ")";
            } else {

                # Rarely %eax is used as an implicit argument --
                # seems to depend on optimizer and compiler version.
                # %Regs is keyed with the leading '%'; the delivered code
                # tested the key "eax", which is never set, so this path
                # was dead. The corrected key is used when fixes are enabled.
                #
                if ( !length($p2args[0]) ) {
                    my $eax_key = $NO_MATCH_DELIVERED_DEMOS ? "%eax" : "eax";
                    if ( length($Regs{$eax_key}) ) {
                        $p2args[0] = "%eax:" . $Regs{"%eax"};
                    } else {
                        $p2args[0] = "";
                    }
                }

                # Mark any empty spaces in the argument list --
                # will flag any missed/improperly processed
                # instructions. (A private index is used so that $i,
                # the current line text, is not clobbered.)
                #
                foreach my $n (0 .. $#p2args) {
                    $p2args[$n] = "<>" if !length($p2args[$n]);
                }
                {
                    local $, = ' ';
                    print "(", @p2args, ")\n";
                }
                @pargs = @p2args;
                $sstr .= "(+" . join(", ", @pargs) . ")";
            }

            # At this point we are prepared to handle a function call.
            # $fn contains the function name and @pargs contains
            # the arguments. $sstr contains the function call and arguments
            # for use in registers. What remains is knowing what to do for
            # each function name and how to massage the arguments to
            # get what we want.
            #
            # I'm not clear on how all of these map into ACLs for subdomain
            # so I've just decoded what I could and left it at that.

            if ( $fn eq "chdir" ) {

                # Print a warning since this can't be tracked w/o global
                # context.
                #
                print "EMIT: WARN: chdir global $pargs[0]\n";

            } elsif ( $fn eq "open" ) {

                # If filename points to register, see if that register
                # contains anything.
                #
                if ( $pargs[0] =~ /^%e..$/ && length($Regs{$pargs[0]}) ) {
                    $pargs[0] = $Regs{$pargs[0]};
                }

                # Decode open mode flags if constant, $rw gets
                # read/write modes; $cf gets '+' when bit 0x200 is set.
                # NOTE(review): the original comment calls 0x200 "append";
                # on Linux/x86 0x200 is O_TRUNC (O_APPEND is 0x400) --
                # confirm which flag was intended.
                #
                $rw = "?";
                $cf="";
                if ( $pargs[1] =~ /^0x/ ) {
                    $tmp = hex($pargs[1]);
                    if ( ($tmp & 3) == 0 ) {
                        $rw="r";
                    } elsif ( ($tmp & 3) == 1 ) {
                        $rw="w";
                    } elsif ( ($tmp & 3) == 2 ) {
                        $rw="rw";
                    }
                    $cf="+" if ($tmp & 0x200);
                }
                print "EMIT: open $pargs[0] $rw $cf\n";

            } elsif ( $fn eq "fopen" ) {

                # fopen flags, while text, will need massaging to
                # put into the proper ACL format.
                #
                print "EMIT: fopen $pargs[0] $pargs[1]\n";

            } elsif ( $fn eq "opendir" ) {

                # Read on foo
                #
                print "EMIT: opendir $pargs[0]\n";

            } elsif ( $fn eq "creat" ) {

                # Write on foo
                #
                print "EMIT: creat $pargs[0] w\n";

            } elsif ( $fn eq "mkdir" ) {

                # ??? ACLs?
                #
                print "EMIT: mkdir $pargs[0]\n";

            } elsif ( $fn eq "unlink" ) {

                # ??? Write?
                #
                print "EMIT: unlink $pargs[0]\n";

            } elsif ( $fn eq "rename" ) {

                # Delete on old, write on new?
                #
                print "EMIT: rename [" . $sstr . "]\n";

            } elsif ( $fn eq "sprintf" ) {

                # In this case put results into corresponding register in case
                # it is used later on.
                #
                print "EMIT: sprintf [$sstr]\n";
                # put in reg for later use
                $Regs{$pargs[0]} = "[" . $sstr . "]";

            } elsif ( $fn eq "snprintf" ) {

                # In this case put results into corresponding register in case
                # it is used later on.
                #
                print "EMIT: snprintf [$sstr]\n";
                # put in reg for later use
                $Regs{$pargs[0]} = "[" . $sstr . "]";

            } elsif ( $fn eq "getenv" ) {

                # In this case put results into corresponding register in case
                # it is used later on.
                #
                $Regs{"%eax"} = "[" . $sstr . "]";

            } elsif ( $fn eq "localtime" ) {

                # Example 1 of a fixed library call. Whenever localtime(3) is
                # called, an open on /etc/localtime will result.
                #
                print "EMIT: localtime \"/etc/localtime\" r\n";

            } elsif ( $fn eq "openlog" ) {

                # Example 2 of a fixed library call. Whenever openlog(3) is
                # called, an open on /dev/log will result. Similar things
                # happen for nameservice calls, *pwent, *grent, etc.
                #
                print "EMIT: openlog \"/dev/log\" r\n";

            } elsif ( $fn =~ /^exec(l|lp|le|v|vp)$/ ||
                      ( $NO_MATCH_DELIVERED_DEMOS && $fn =~ /^exec(ve|vpe)$/ ) ) {

                # we want to know about all the exec*() variants
                # (execve/execvpe were missing from the delivered list)
                print "EMIT: $fn [$sstr]\n";

            } elsif ( $fn eq "system" ) {

                # system is important too
                print "EMIT: system [$sstr]\n";

            }
        }

        # Reset arguments since we just passed a call.
        #
        undef @pargs;
        undef @p2args;

    } elsif ( $line[-2] eq "mov" &&
              ( $line[-1] eq "%esp,%ebp" || $line[-1] eq "%ebp,%esp" ) ) {

        # Begin / end of subroutine mov's
        #
        shift @pargs if !$NO_MATCH_DELIVERED_DEMOS; # Bug in previous. This is handled elsewhere
        print "+",$i,"\n";

    } elsif ( $line[-2] =~ /^mov/ && $line[-1] =~ /^(.*),(%e..)$/ ) {

        # Save source, destination register from operands.
        #
        $str="";
        $x=$1;
        $y=$2;

        # Check for $x being a local variable that we are tracking.
        # If so, use that value to set the destination register.
        # Then check for funky indirection modes and clear them out.
        # Otherwise, check for relocation information and possible
        # rodata string else use the values given.
        # We set the "set time" of the register for use elsewhere
        # for register indirection (cf #1#).
        #
        if ( $x =~ /^0xf[0-9a-f]+\(%ebp\)$/ && length($Regs{$x})) {
            $Regs{$y} = $Regs{$x};
        } elsif ( $x =~ /\(.*\),/ ) {       # funky indirection modes
            $Regs{$y} = $x;
        } else {
            if ( $reloc_flag ) {
                # Strip the immediate marker so hex() sees the number.
                # The delivered code used s/^$//, which never removes
                # the '$' and made hex() return 0.
                $x =~ s/^\$// if $NO_MATCH_DELIVERED_DEMOS;
                if ( $obj_ref{$cv} eq ".rodata" ) {
                    $str = get_rodata_str(hex($x));
                    $Regs{$y} = $str;
                    $str = "\t".$str;
                } else {
                    $tmp = "<<" . $obj_ref{$cv} . "+" . $x . ">>";
                    $Regs{$y} = $tmp;
                }
            } else {
                if ( $x =~ /^\$/ ) {
                    $x =~ s/^\$//;
                    my $imm = hex($x);
                    if ( in_section($imm, ".rodata") ) {
                        $Regs{$y} = get_rodata_str($imm);
                    } else {
                        $Regs{$y} = $x;
                    }
                } else {
                    $Regs{$y} = $x;
                }
            }
        }
        $Seta{$y} = $cv;

        # Handle relocation information if any. If we were able to resolve
        # a string, print it. Otherwise, note reloc info.
        # Just mark the line as handled otherwise.
        #
        if ( $reloc_flag ) {
            if ( length($str) ) {
                print "+",$i,$str,"\n";
            } else {
                print "+",$i," reloc <<$obj_ref{$cv}>>\n";
            }
        } else {
            print "+",$i,"\n";
        }
        print "== $y = $Regs{$y}\n";

    } elsif ( $line[-2] =~ /^mov/ && $line[-1] =~ /\(%esp(,1)?\)/ ) {

        # Extract offset and operand from a statement like:
        #     mov %eax,0x8(%esp,1)
        # $x gets the 1st operand, $y gets the offset into the argument list.
        # If the operands do not have that exact shape, nothing is captured
        # and the line falls through to the "X" (unprocessed) case below.
        #
        if ( $line[-1] =~ /^(.*),(0x.*)?\(%esp(,1)?\)$/ ) {
            $x=$1;
            $y=hex($2)/4;
        } else {
            $x="";
            $y=0;
        }

        if ( $x =~ /^\$/ ) {

            # If the operand is a value, check for rodata string.
            # Otherwise just put the literal value in the argument list.
            #
            $x =~ s/^\$//;
            my $imm = hex($x);
            # Delivered code omitted the "section start is non-zero" test
            # here that the push and register-mov cases apply.
            my $is_rodata = $NO_MATCH_DELIVERED_DEMOS
                ? in_section($imm, ".rodata")
                : ( $imm >= ${ $SEC{".rodata"} }[0] && $imm < ${ $SEC{".rodata"} }[1] );
            if ( $is_rodata ) {
                $str = get_rodata_str($imm);
                print "+",$i,"\t",$str,"\n";
                print "[",$y,"]=",$str,"\n";
                $p2args[$y] = $str;
            } else {
                print "+",$i,"\n";
                print "[",$y,"]=",$x,"\n";
                $p2args[$y] = $x;
            }

        } elsif ( $x =~ /^%/ ) {

            # If a register was set to the value of another register
            # *and* that register has not been modified since, then
            # substitute the second register's contents for the first.
            # Otherwise just put the register name.      #1#
            #
            print "+",$i,"\n";
            if ( length($Regs{$x}) && $Seta{$x} > $Seta{$Regs{$x}} ) {
                print "[",$y,"]=",$x," ==> ", $Regs{$x},"\n";
                $p2args[$y] = $Regs{$x};
            } else {
                print "[",$y,"]=",$x,"\n";
                $p2args[$y] = $x;
            }

        } else {

            # Wasn't a format we deal with, so mark the line as unprocessed.
            #
            print "X",$i,"\n";
        }

    } elsif ( $line[-2] eq "pushl" ) {

        # Push long (32-bit) quantity; recorded with an "L:" prefix.
        #
        if ( $reloc_flag ) {
            $line[-1] =~ s/^\$//;
            $tmp=$line[-1];
            $line[-1] = "<<" . $obj_ref{$cv} . "+" . $tmp . ">>";
            print "+",$i," reloc $line[-1]\n";
        } else {
            print "+",$i,"\n";
        }
        unshift(@pargs, "L:".$line[-1]);

    } elsif ( $line[-2] eq "add" || $line[-2] eq "sub" ||
              $line[-2] eq "and" || $line[-2] eq "ror" ) {

        # Not implementing these opcodes. Invalidate affected register
        # (only when the destination really is a register).
        #
        if ( $line[-1] =~ /^.*,(%e..)$/ ) {
            $Regs{$1} = undef;
        }
        print "+",$i, $reloc_flag ? " reloc <<$obj_ref{$cv}>>\n" : "\n";

    } elsif ( $line[-2] =~ /^mov[lwb]?$/ ) {

        # We only care about (%esp,1), (%esp), and register versions of mov opcodes
        # and those are handled above.
        #
        print "+",$i, $reloc_flag ? " reloc <<$obj_ref{$cv}>>\n" : "\n";

    } elsif ( $line[-2] =~ /^j/ ) {

        # Branch/change of control. Reset regs and stack.
        #
        print "+",$i,"\n";
        undef @pargs;
        undef @p2args;
        undef %Regs;

    } elsif ( $line[-2] =~ /^test[bwl]?$/ || $line[-2] =~ /^cmp[bwl]?$/ ) {

        # Just ignore. Nothing to do with comparison opcodes.
        # Print relocation info if present.
        #
        print "+",$i, $reloc_flag ? " reloc <<$obj_ref{$cv}>>\n" : "\n";

    } elsif ( $line[-3] eq "repz" || $line[-3] eq "repnz" || $line[-2] eq "setne" ) {

        # Moving loop opcodes. Probably should invalidate related registers
        # and not depend on the compiler to reinitialize them.
        #
        print "+",$i,"\n";

    } elsif ( $line[-2] eq "pop" && $line[-1] eq "%ebp" ) {

        # Just ignore. End of subroutine cruft.
        #
        print "+",$i,"\n";

    } elsif ( $line[-2] eq "lea" &&
              ( $line[-1] =~ /,(%e..)$/ || $line[-1] =~ /[sd]i.*[sd]i/ ) ) {

        # Mostly filler between subroutines; the destination register,
        # when one was captured, is invalidated.
        #
        $Regs{$1} = undef if length($1);
        print "+",$i,"\n";

    } elsif ( $line[-2] eq "xor" && $line[-1] =~ /^(%e..),\1$/ ) {

        # xor-ing a register with itself is a shorthand for setting it to zero.
        #
        $Regs{$1} = 0;
        print "+",$i,"\n";

    } elsif ( $line[-2] eq "inc" || $line[-2] eq "dec" ||
              $line[-2] eq "not" || $line[-2] eq "neg" ) {

        # Not implementing these opcodes; just invalidate register contents.
        #
        $Regs{$line[-1]} = undef;
        print "+",$i,"\n";

    } elsif ( $line[-1] eq "hlt" || $line[-1] eq "nop" ||
              $line[-1] eq "leave" || $line[-1] eq "ret" ) {

        # These are all control flow related opcodes; all ignored.
        #
        print "+",$i,"\n";

    } elsif ( $line[-2] eq "pop" ) {

        # Currently not used to manipulate the stack.
        # Stack is dumped after each "call".
        #
        print "+",$i,"\n";

    } else {

        # This line is not recognized nor processed.
        #
        print $i,"\n";
    }

    # Reset relocation info flag for next pass
    #
    $reloc_flag=0;
}
close($text_fh);

# All done
#
exit 0;

#=========================================================================
#
# Subroutines:

# split_disasm_line:
#   Takes one raw line of "objdump -d" output. Strips the newline and a
#   trailing " <symbol>" annotation, splits on whitespace, drops the
#   empty leading field, and removes the ':' after the address.
#   Returns (cleaned line text, decimal address, fields...), where the
#   first field is the address without its colon.
#
sub split_disasm_line {
    my ($raw) = @_;
    chomp($raw);
    $raw =~ s/ <[\w+]+>$//;
    my @fields = split(/\s+/, $raw);
    shift @fields;
    $fields[0] =~ s/:$//;
    return ($raw, hex("0x".$fields[0]), @fields);
}

# in_section:
#   Returns true when the decimal address lies inside the named section
#   as recorded in %SEC ([start, end, len], end exclusive). A section
#   that is missing or starts at address 0 never matches.
#
sub in_section {
    my ($where, $section) = @_;
    my $range = $SEC{$section};
    return 0 unless $range && $range->[0];
    return ( $where >= $range->[0] && $where < $range->[1] ) ? 1 : 0;
}

# get_rodata_str:
#   Takes a decimal address and returns the best representation
#   of data at that address in the %RODATA hash as a string.
#   Reading stops at the first zero or missing byte. The string is
#   returned in double quotes; printable ASCII is copied as is, newline
#   and tab become \n and \t, and every other byte becomes '?'.
#   (Declared without a prototype: the former "()" prototype claimed
#   the sub takes no arguments.)
#
sub get_rodata_str {
    my($addr) = @_;
    my($str);

    $str = "\"";
    while ( $RODATA{$addr} ) {
        if ( $RODATA{$addr} > 31 && $RODATA{$addr} < 127 ) {
            $str .= sprintf "%c", $RODATA{$addr};
        } elsif ( 10 == $RODATA{$addr} ) {
            $str .= "\\n";
        } elsif ( 9 == $RODATA{$addr} ) {
            $str .= "\\t";
        } else {
            $str .= "?";
        }
        $addr++;
    }
    $str .= "\"";
    return $str;
}

#=========================================================================
#
# References:
#
# Ref. #1:
#   User subroutines always begin with "push %ebp". If this
#   changes, then a number of modifications of this script will be
#   required. For example:
#
#   8049864: 55 push %ebp
#
#
#
# Ref. #2:
#
#   This is the typical start() routine. Note the last item
#   pushed on the stack is the address of main().
#
#   8049840:  31 ed                  xor    %ebp,%ebp
#   8049842:  5e                     pop    %esi
#   8049843:  89 e1                  mov    %esp,%ecx
#   8049845:  83 e4 f0               and    $0xfffffff0,%esp
#   8049848:  50                     push   %eax
#   8049849:  54                     push   %esp
#   804984a:  52                     push   %edx
#   804984b:  68 50 d9 04 08         push   $0x804d950      sub_75
#   8049850:  68 20 d9 04 08         push   $0x804d920      sub_74
#   8049855:  51                     push   %ecx
#   8049856:  56                     push   %esi
#   8049857:  68 10 b2 04 08         push   $0x804b210      main
#   804985c:  e8 4f fd ff ff         call   0x80495b0       __libc_start_main
#   __libc_start_main( 0x804b210 %esi %ecx 0x804d920 0x804d950 %edx %esp %eax )
#   8049861:  f4                     hlt
#   8049862:  90                     nop
#   8049863:  90                     nop
#
#
#
# Ref. #3:
#   A fake call used for getting the current execution address (PC).
#   It is a call to relative address 0 (next instruction) and then
#   the return address is popped off the stack for manipulation.
#   I've only seen this in system code. For example:
#
#   8049869:  e8 00 00 00 00         call   0x804986e       fake_localization_call
#   804986e:  5b                     pop    %ebx
#   804986f:  81 c3 e6 59 00 00      add    $0x59e6,%ebx
#   8049875:  8b 83 84 01 00 00      mov    0x184(%ebx),%eax
#
#
#
# Ref. #4:
#   Calling sequence based on mov. In this case operands are put in the stack
#   by moving them to 0xXX(%esp,1) where 0xXX is 4 times the parameter number,
#   e.g., 0x0 is first, 0x4 is second, etc. Variants include just (%esp).
#
#   8049bb6:  c7 44 24 04 bd dd 04   movl   $0x804ddbd,0x4(%esp,1)   "%s"
#   8049bbd:  08
#   8049bbe:  8b 45 08               mov    0x8(%ebp),%eax
#   8049bc1:  c7 04 24 03 00 00 00   movl   $0x3,(%esp,1)
#   8049bc8:  89 44 24 08            mov    %eax,0x8(%esp,1)
#   8049bcc:  e8 bf f8 ff ff         call   0x8049490       syslog
#   syslog( 0x3 "%s" 0x8(%ebp) )
#
#
#
# Ref. #5:
#   Calling sequence based on push. In this case operands are put on the stack in
#   reverse order for the call. For example:
#
#   804b286:  68 aa 53 0c 08         push   $0x80c53aa      "none"
#   804b28b:  a1 60 b4 21 08         mov    0x821b460,%eax
#   804b290:  50                     push   %eax
#   804b291:  e8 d6 fb ff ff         call   804ae6c         strcasecmp
#   strcasecmp( %eax "none" )