From 9595c27848fa3aa43378f1ba8f2906cf0d38b042 Mon Sep 17 00:00:00 2001 From: clc5q <clc5q@git.zephyr-software.com> Date: Thu, 13 Dec 2007 19:45:43 +0000 Subject: [PATCH] Added lots of code to improve IDA code identification using code addresses detected by objdump. --- SMPStaticAnalyzer.cpp | 869 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 860 insertions(+), 9 deletions(-) diff --git a/SMPStaticAnalyzer.cpp b/SMPStaticAnalyzer.cpp index e1f63231..0be81e5c 100644 --- a/SMPStaticAnalyzer.cpp +++ b/SMPStaticAnalyzer.cpp @@ -5,6 +5,11 @@ // (Software Memory Protection). // +#include <list> +#include <vector> + +#include <string> + #include <ida.hpp> #include <idp.hpp> #include <allins.hpp> @@ -14,6 +19,7 @@ #include <intel.hpp> #include <loader.hpp> #include <lines.hpp> +#include <nalt.hpp> #include <name.hpp> #include <ua.hpp> @@ -23,11 +29,13 @@ // Set to 1 for debugging output #define SMP_DEBUG 1 -#define SMP_DEBUG2 1 // verbose +#define SMP_DEBUG2 0 // verbose #define SMP_DEBUG3 0 // verbose #define SMP_DEBUG_MEM 0 // print memory operands #define SMP_DEBUG_TYPE0 0 // Output instr info for OptType = 0 #define SMP_DEBUG_ORPHANS 1 // find code outside of functions +#define SMP_DEBUG_CHUNKS 0 // restructuring tail chunks, shared chunks, etc. +#define SMP_DEBUG_DATA_ONLY 0 // Find & fix data addresses in code segments // Set to 1 when doing a binary search using SMP_DEBUG_COUNT to find // which function is causing a problem. @@ -35,6 +43,8 @@ #define SMP_DEBUG_COUNT 356 // How many funcs to process in problem search int FuncsProcessed = 0; +#define SMP_FIXUP_IDB 1 // Try to fix the IDA database? +#define SMP_DEBUG_FIXUP_IDB 0 // debugging output for FixupIDB chain // Define optimization categories for instructions. int OptCategory[NN_last+1]; @@ -61,11 +71,46 @@ static char *DataTypes[] = { "VOID", "NUMHEX", "NUMDEC", "CHAR", "STRUCTOFFSET", "STACKVAR", "NUMFLOAT", "UNKNOWN", "UNKNOWN", "UNKNOWN", 0}; +// Filename (not including path) of executable being analyzed. +static char RootFileName[MAXSTR]; + +// Code addresses identified by a disassembler, such as objdump on +// Linux. These can be used to improve the code vs. data identification +// of IDA Pro. +vector<ea_t> DisasmLocs; +// Code addresses as identified by IDA Pro, to be compared to DisasmLocs. +vector<ea_t> IDAProLocs; + +// Function start and end addresses (for function entry chunks only). +// Kept here because IDA Pro 5.1 seems to have a memory overwriting +// problem when iterating through all functions in the program. An existing +// func_t *ChunkInfo data structure was getting overwritten by one of the +// function func_t data structures, causing changes of startEA and endEA among +// other things. +struct SMP_bounds_t { + ea_t startEA; + ea_t endEA; +}; + +vector<SMP_bounds_t> FuncBounds; + +// List of functions that need to be reanalyzed after all the code fixup +// and code discovery is complete. Kept as a list of addresses; any address +// within the function is good enough to designate it. +list<ea_t> ReanalyzeList; + void IDAP_run(int); + +// Functions for diagnosing and/or fixing problems in the IDA database. +void FixupIDB(void); // Driver for all other fixing functions. +void FindDataInCode(void); +void AuditTailChunkOwnership(void); void FindOrphanedCode(segment_t *, FILE *); +void FixCodeIdentification(void); void AuditCodeTargets(void); ea_t FindNewFuncLimit(ea_t); void SpecialDebugOutput(void); +void RemoveIDACodeAddr(ea_t); static int idaapi idp_callback(void *, int event_id, va_list va) { if (event_id == ph.auto_empty_finally) { // IDA analysis is done @@ -105,34 +150,61 @@ void IDAP_term(void) { } void IDAP_run(int arg) { - segment_t *seg; char buf[MAXSTR]; ea_t ea; flags_t ObjFlags; bool ReadOnlyFlag; FILE *SymsFile; - char FuncName[MAXSTR]; SMPFunction *CurrFunc = NULL; bool FuncsDumped = false; +#if SMP_DEBUG2 + char FuncName[MAXSTR]; +#endif + #if SMP_DEBUG msg("Beginning IDAP_run.\n"); #endif // Open the output file. - SymsFile = qfopen("SMP.annot", "w"); + ssize_t FileLen; + FileLen = get_root_filename(RootFileName, sizeof(RootFileName) - 1); + string SymsFileName(RootFileName); + string FileSuffix(".annot"); + SymsFileName += FileSuffix; + SymsFile = qfopen(SymsFileName.c_str(), "w"); if (NULL == SymsFile) { - error("FATAL: Cannot open output file SMP.annot\n"); + error("FATAL: Cannot open output file %s\n", SymsFileName.c_str()); return; } (void) memset(OptCount, 0, sizeof(OptCount)); (void) memset(AnnotationCount, 0, sizeof(AnnotationCount)); - // Pre-audit the IDA database by seeing if all branches and calls - // have proper code targets and code cross references. - SpecialDebugOutput(); - AuditCodeTargets(); + // Record the start and end addresses for all function entry + // chunks in the program. + FuncBounds.reserve(10 + get_func_qty()); + for (size_t FuncIndex = 0; FuncIndex < get_func_qty(); ++FuncIndex) { + func_t *FuncInfo = getn_func(FuncIndex); + SMP_bounds_t temp; + temp.startEA = FuncInfo->startEA; + temp.endEA = FuncInfo->endEA; + FuncBounds.push_back(temp); + } + +#if SMP_DEBUG_DATA_ONLY + FindDataInCode(); + FixCodeIdentification(); + qfclose(SymsFile); + return; +#endif + + // Pre-audit the IDA database by seeing if the distinction + // between code and data can be improved, and if all branches + // and calls have proper code targets and code cross references. +#if SMP_FIXUP_IDB + FixupIDB(); +#endif // First, examine the data segments and print info about static // data, such as name/address/size. Do the same for functions in @@ -291,9 +363,11 @@ void IDAP_run(int arg) { delete CurrFunc; CurrFunc = NULL; } // end for (size_t FuncIndex = 0; ...) + #if SMP_DEBUG_ORPHANS FindOrphanedCode(seg, SymsFile); #endif + } // end else if (seg->type === SEG_CODE) else { #if SMP_DEBUG @@ -329,6 +403,779 @@ plugin_t PLUGIN = { IDAP_hotkey }; +// Find all code addresses in the IDA database and enter them into +// IDAProLocs. Find all code addresses identified by the external +// disassembler (e.g. objdump) and enter them into DisasmLocs. +void FindCodeAddresses(void) { + // Read in code addresses as found by an external disassembler. + ea_t CurrDisasmAddr; + string DisasmFileName(RootFileName); + string FileSuffix(".SMPobjdump"); + DisasmFileName += FileSuffix; + FILE *DisasmFile = qfopen(DisasmFileName.c_str(), "r"); + if (NULL == DisasmFile) { + error("FATAL: Cannot open input file %s\n", DisasmFileName.c_str()); + return; + } + +#define DISASM_RESERVE_SIZE 50000 + DisasmLocs.reserve(DISASM_RESERVE_SIZE); + int ScanReturn = qfscanf(DisasmFile, "%x", &CurrDisasmAddr); + while (1 == ScanReturn) { + int NextChar; + DisasmLocs.push_back(CurrDisasmAddr); + // Swallow the rest of the input line and get the next address. + do { + NextChar = qfgetc(DisasmFile); + } while ((EOF != NextChar) && ('\n' != NextChar)); + ScanReturn = qfscanf(DisasmFile, "%x", &CurrDisasmAddr); + } // end while (1 == ScanReturn) + if (0 >= DisasmLocs.size()) { + msg("ERROR: No addresses read from %s\n", DisasmFileName.c_str()); + qfclose(DisasmFile); + return; + } + else { + msg("%d Disasm addresses read from %s\n", DisasmLocs.size(), + DisasmFileName.c_str()); + qfclose(DisasmFile); + } + + // Find all the code locs in the IDA Pro database. As we find + // them, store them in IDAProLocs. + for (int SegIndex = 0; SegIndex < get_segm_qty(); ++SegIndex) { + segment_t *seg = getnseg(SegIndex); + if (SEG_CODE != seg->type) + continue; + + for (ea_t addr = seg->startEA; addr < seg->endEA; addr = get_item_end(addr)) { + flags_t InstrFlags = getFlags(addr); + if (isHead(InstrFlags) && isCode(InstrFlags)) { + IDAProLocs.push_back(addr); + if ((0x806cda4 <= addr) && (0x806cf99 >= addr)) + msg("IDA code addr: %x\n", addr); + } // end if (isHead(addr) && isCode(addr) +#if SMP_DEBUG_FIXUP_IDB + else if ((0x806cda4 <= addr) && (0x806cf99 >= addr)) { + if (!isHead(InstrFlags)) + msg("Weirdness: not isHead at %x\n", addr); + if (isUnknown(InstrFlags)) { + msg("Weirdness: isUnknown at %x\n", addr); + } + } +#endif + } // end for (ea_t addr = seg->startEA; ...) + } // end for (int SegIndex = 0; ...) + return; +} // end FindCodeAddresses() + +// Return true if addr is not a proper beginning address for an instruction. +// Return false otherwise. +// Currently, we claim that an instruction is misaligned if DisasmLocs does +// not contain it. This function is useful for dealing with errors in IDA +// code identification, in which a large code section is identified as data, +// but some instructions in the middle of the "data" are identified as +// code but IDA often starts on the wrong boundary in these cases. +bool IsCodeMisaligned(ea_t addr) { + // Do a binary search for addr within DisasmLocs, which is sorted + // in ascending address order because of the way in which it was + // generated. + size_t min = 0; + size_t max = DisasmLocs.size(); // don't access DisasmLocs[max] + size_t index = (min + max) / 2; + + while (addr != DisasmLocs[index]) { + if (min >= (max - 1)) + return true; +#if 0 + msg("min: %d max: %d index: %d\n", min, max, index); +#endif + if (addr < DisasmLocs[index]) + max = index; + else // must be addr > DisasmLocs[index]; + min = index; + + index = (min + max) / 2; + } + + return false; +} // end of IsCodeMisaligned() + +void RemoveIDACodeAddr(ea_t addr) { + // Do a binary search for addr within IDAProLocs, which is sorted + // in ascending address order because of the way in which it was + // generated. Delete the element of IDAProLocs if found. + size_t min = 0; + size_t max = IDAProLocs.size(); // don't access IDAProLocs[max] + size_t index = (min + max) / 2; + + while (addr != IDAProLocs[index]) { + if (min >= (max - 1)) + return; +#if 0 + msg("min: %d max: %d index: %d\n", min, max, index); +#endif + if (addr < IDAProLocs[index]) + max = index; + else // must be addr > IDAProLocs[index]; + min = index; + + index = (min + max) / 2; + } + + // IDAProLocs[index] contains addr. + vector<ea_t>::iterator RemovalIterator = IDAProLocs.begin(); + RemovalIterator += index; + RemovalIterator = IDAProLocs.erase(RemovalIterator); + return; +} // end of RemoveIDACodeAddr() + +// Driver for all other fixing functions. Upon its return, the IDA +// database (IDB file) should be fixed up as much as we can fix it. +void FixupIDB(void) { + FindCodeAddresses(); +#if SMP_DEBUG_FIXUP_IDB + SpecialDebugOutput(); +#endif + AuditCodeTargets(); + FindDataInCode(); + AuditTailChunkOwnership(); + FixCodeIdentification(); +} // end of FixupIDB() + +// Find and print all data head addresses in code segments. +// If an isolated code instruction is found in the midst of a run +// of data bytes and has no code xrefs jumping to it, it is not +// reachable as code and is undoubtedly a mixup by IDA. Possibly +// the whole data region will be converted to code later, in which +// case the isolated code is not necessarily properly aligned and +// parsed at its present address, so we are glad to convert it into +// data anyway so that FindDataToConvert() will succeed on it later. +// Data to code conversion, and isolated code detection, are inhibited +// by IDA identifying several consecutive instructions in the midst +// of a data region, with the code addresses not agreeing with the +// external disassembler's code addresses. We will convert these +// misaligned instructions to data as we detect them. We will also +// convert unexplored bytes (isUnknown(flags) == true) into data if +// they are in the midst of a data sequence. +#define MIN_DATARUN_LEN 24 // #bytes on either side of "isolated" code +void FindDataInCode(void) { + size_t DataRunLen = 0; // How many data bytes in a row have we seen? + bool IsolatedCodeTrigger = false; // Have seen data, then isolated code + // Now looking for data + ea_t IsolatedCodeAddr; + int IsolatedCodeLen; + int InstrLen; + + for (int SegIndex = 0; SegIndex < get_segm_qty(); ++SegIndex) { + char SegName[MAXSTR]; + segment_t *seg = getnseg(SegIndex); + ssize_t SegNameSize = get_segm_name(seg, SegName, sizeof(SegName) - 1); + if (SEG_CODE != seg->type) + continue; +#if SMP_DEBUG_FIXUP_IDB + msg("Non-code addresses for code segment %s from %x to %x\n", + SegName, seg->startEA, seg->endEA); +#endif + for (ea_t addr = seg->startEA; addr < seg->endEA; addr = get_item_end(addr)) { + flags_t AddrFlags = getFlags(addr); + if (isHead(AddrFlags)) { + if (isData(AddrFlags)) { + DataRunLen += get_item_size(addr); +#if SMP_DEBUG_FIXUP_IDB + msg("Data: %x\n", addr); +#endif + if (MIN_DATARUN_LEN <= DataRunLen) { + if (IsolatedCodeTrigger) { + // Saw data, then one isolated code, then data + do_unknown_range(IsolatedCodeAddr, IsolatedCodeLen, DOUNK_SIMPLE); + RemoveIDACodeAddr(IsolatedCodeAddr); + if (do_data_ex(IsolatedCodeAddr, byteflag(), + IsolatedCodeLen, BADNODE)) { + msg("Converted isolated code to data: %x\n", + IsolatedCodeAddr); + } + else { + msg("Failed to convert isolated code to data: %x len: %x\n", + IsolatedCodeAddr, IsolatedCodeLen); + } + IsolatedCodeTrigger = false; + } // end if (IsolatedCodeTrigger) + } // end if (MIN_DATARUN_LEN <= DataRunLen) + } // end if (isData(AddrFlags) + else if (isUnknown(AddrFlags)) { + // Just in case; unknown usually means not head or tail + // If in a data run, convert to data. + InstrLen = get_item_size(addr); + msg("Unknown: %x len: %x\n", addr, InstrLen); + if (0 < DataRunLen) { + if (do_data_ex(addr, byteflag(), InstrLen, BADNODE)) { + msg("Converted unknown to data at %x len: %x\n", addr, InstrLen); + DataRunLen += InstrLen; + } + else { + msg("Failed to convert unknown to data at %x len: %x\n", addr, InstrLen); + DataRunLen = 0; + IsolatedCodeTrigger = false; + } + } + } + else if (isCode(AddrFlags)) { // must be true + if (MIN_DATARUN_LEN <= DataRunLen) { + msg("DataRunLen: %d at %x\n", DataRunLen, addr); + InstrLen = ua_ana0(addr); +#if SMP_DEBUG_FIXUP_IDB + msg("Calling IsCodeMisaligned: len %d\n", InstrLen); +#endif + if (IsCodeMisaligned(addr)) { +#if SMP_DEBUG_FIXUP_IDB + msg("Code was misaligned.\n"); +#endif + do_unknown_range(addr, InstrLen, DOUNK_SIMPLE); + RemoveIDACodeAddr(addr); + if (do_data_ex(addr, byteflag(), InstrLen, BADNODE)) { + msg("Converted misaligned code to data at %x : len: %x\n", + addr, InstrLen); + // Step back so data gets processed. + DataRunLen += get_item_size(addr); + continue; // skip reset of DataRunLen + } + else { + msg("Misaligned code left as unknown at %x : len: %x\n", + addr, InstrLen); + IsolatedCodeTrigger = false; + } + } // end if (IsCodeMisaligned() ...) + else if (!hasRef(AddrFlags)) { + // No references at all --> isolated code. + IsolatedCodeTrigger = true; + IsolatedCodeAddr = addr; + IsolatedCodeLen = InstrLen; + } + else { + xrefblk_t xb; + bool ok = xb.first_to(IsolatedCodeAddr, XREF_ALL); + if (!ok) { + // No code xrefs to this target addr. + IsolatedCodeTrigger = true; + IsolatedCodeAddr = addr; + IsolatedCodeLen = InstrLen; + } + } + } // end if (MIN_DATARUN_LEN <= DataRunLen) + else if (IsolatedCodeTrigger) { + // Two instructions in a row does not fit the pattern. + IsolatedCodeTrigger = false; + } + DataRunLen = 0; + } // end if (isData) ... else if (isUnknown) ... else isCode + } // end if (isHead) + else if (isUnknown(AddrFlags)) { + // If in a data run, convert to data. + InstrLen = get_item_size(addr); + msg("Unknown: %x len: %x\n", addr, InstrLen); + if (0 < DataRunLen) { + if (do_data_ex(addr, byteflag(), InstrLen, BADNODE)) { + msg("Converted unknown to data at %x len: %x\n", addr, InstrLen); + DataRunLen += InstrLen; + } + else { + msg("Failed to convert unknown to data at %x len: %x\n", addr, InstrLen); + DataRunLen = 0; + IsolatedCodeTrigger = false; + } + } + } + } // end for (ea_t addr = seg->startEA; ...) + } // end for (int SegIndex = 0; ...) + return; +} // end of FindDataInCode() + + +// The choices that IDA makes for deciding which parent function of a +// TAIL chunk is the primary owner of the tail can be counterintuitive. +// A function entry can both fall into and jump to a tail chunk that +// is contiguous with it, yet the "owner" might be a function that is +// far below it in the executable address space. This function will +// change the ownership to a more sensible arrangement. +void AuditTailChunkOwnership(void) { + char FuncName[MAXSTR]; + // Iterate through all chunks in the program. + for (size_t ChunkIndex = 0; ChunkIndex < get_fchunk_qty(); ++ChunkIndex) { + func_t *ChunkInfo = getn_fchunk((int) ChunkIndex); + if (is_func_tail(ChunkInfo)) { + // For each TAIL chunk, find all the parent chunks. Find the last + // parent chunk with an address less than the TAIL chunk address. + ea_t BestCandidate = 0; + func_parent_iterator_t FuncParent(ChunkInfo); +#if SMP_DEBUG_CHUNKS + msg("Tail chunk: %x ", ChunkInfo->startEA); +#endif + for (bool ok = FuncParent.first(); ok; ok = FuncParent.next()) { + ea_t parent = FuncParent.parent(); +#if SMP_DEBUG_CHUNKS + msg(" parent: %x ", parent); +#endif + if ((parent > BestCandidate) && (parent < ChunkInfo->startEA)) + BestCandidate = parent; + } +#if SMP_DEBUG_CHUNKS + msg("\n"); +#endif + // Make the best parent chunk the owner of the TAIL chunk if it is + // not already the owner. + if (ChunkInfo->owner != BestCandidate) { + if (0 < BestCandidate) { + if (set_tail_owner(ChunkInfo, BestCandidate)) { + func_t *FuncInfo = get_func(BestCandidate); + msg("Set %x as new owner of tail %x\n", + BestCandidate, ChunkInfo->startEA); + // Reanalyze the parent function (and all its + // tail chunks) now that the structure has changed. + reanalyze_function(FuncInfo); + } + else { + msg("set_tail_owner failed for tail %x and parent %x\n", + ChunkInfo->startEA, BestCandidate); + } + } + else { + func_t *FuncInfo = get_func(ChunkInfo->owner); + get_func_name(FuncInfo->startEA, FuncName, sizeof(FuncName) - 1); +#if SMP_DEBUG_CHUNKS + msg("No good parent candidate before tail at %x\n", + ChunkInfo->startEA); + msg("Current parent is %x: %s\n", FuncInfo->startEA, FuncName); +#endif + // Find out if a function entry chunk that comes before the + // tail is a better candidate for the owner (i.e. it falls + // through to the tail, or jumps to it). + BestCandidate = 0; +#if SMP_DEBUG_CHUNKS + msg("Finding parent func candidates for %x:", ChunkInfo->startEA); +#endif + SMP_bounds_t CurrFunc; + for (size_t FuncIndex = 0; FuncIndex < FuncBounds.size(); ++FuncIndex) { + CurrFunc = FuncBounds[FuncIndex]; + if ((CurrFunc.startEA < ChunkInfo->startEA) + && (CurrFunc.startEA > BestCandidate)) { + BestCandidate = CurrFunc.startEA; +#if SMP_DEBUG_CHUNKS + msg(" candidate: %x tail: %x", BestCandidate, + ChunkInfo->startEA); +#endif + } + else { +#if SMP_DEBUG_CHUNKS + msg(" not a candidate: %x tail: %x best: %x\n", + CurrFunc.startEA, ChunkInfo->startEA, BestCandidate); +#endif + break; + } + } // end for (size_t FuncIndex = 0; ...) + if (0 >= BestCandidate) { // highly unlikely + msg("No good func entry parent candidate.\n"); + } + else { + FuncInfo = get_func(BestCandidate); + get_func_name(FuncInfo->startEA, FuncName, sizeof(FuncName) - 1); +#if SMP_DEBUG_CHUNKS + msg("Best func entry parent candidate: %s at %x", + FuncName, BestCandidate); + if (FuncInfo->endEA == ChunkInfo->startEA) + msg(" Function endEA == tail chunk startEA"); + msg("\n"); +#endif + } + } + } // end if (ChunkInfo->owner != BestCandidate) +#if SMP_DEBUG_CHUNKS + else { + msg("Already best parent for %x is %x\n", ChunkInfo->startEA, + ChunkInfo->owner); + } +#endif + } // end if (is_func_tail(ChunkInfo)) + } // end for (size_t ChunkIndex = 0; ...) + + return; +} // end of AuditTailChunkOwnership() + +// If the addresses signified from DisasmIndex to IDAProIndex are +// all considered data and do NOT follow a return instruction, +// return false and update AreaSize to reflect the area to be +// converted. +// Return value: true -> skip to IDAProIndex; false -> convert AreaSize bytes. +bool FindDataToConvert(size_t IDAProIndex, size_t DisasmIndex, int &AreaSize) { + ea_t PrevIDAAddr; + ea_t NextIDAAddr; + size_t ShadowDisasmIndex = DisasmIndex - 1; + ea_t DisasmAddr = DisasmLocs[ShadowDisasmIndex]; + bool CannotConvert = false; // return value + bool DebugAddress = false; +#if SMP_DEBUG_FIXUP_IDB + DebugAddress = (DisasmAddr == 0x806c19a); +#endif + + if (DebugAddress) { + msg("IDAProIndex: %d DisasmIndex: %d\n", IDAProIndex, DisasmIndex); + msg("IDA locs size %d Disasm locs size %d\n", IDAProLocs.size(), + DisasmLocs.size()); + } + if (IDAProIndex >= IDAProLocs.size()) { + // Have already processed the last IDA address. + if (DebugAddress) msg(" Already done with IDAProLocs.\n"); + return true; + } + else if (DisasmIndex >= DisasmLocs.size()) { + // Strange. Last Disasm address is only one to convert, and + // IDA still has addresses after that? + if (DebugAddress) msg(" Already done with DisasmLocs.\n"); + return true; + } + else if (IDAProIndex < 2) { + // We have Disasm addrs before the very first IDA addr. We + // don't trust this boundary case. + if (DebugAddress) msg(" Boundary case with IDAProLocs.\n"); + return true; + } + NextIDAAddr = IDAProLocs[IDAProIndex - 1]; + PrevIDAAddr = IDAProLocs[IDAProIndex - 2]; + if (DebugAddress) msg(" PrevIDAAddr: %x NextIDAAddr: %x\n", PrevIDAAddr, NextIDAAddr); + + // See if previous IDA address was a return. + flags_t PrevFlags = getFlags(PrevIDAAddr); + if (!isCode(PrevFlags) || !isHead(PrevFlags)) { + msg("PrevIDAAddr %x not isCode or not isHead.\n", PrevIDAAddr); + return true; + } + SMPInstr PrevInstr(PrevIDAAddr); + PrevInstr.Analyze(); + if (DebugAddress) msg("Finished PrevInstr.Analyze()\n"); + if (PrevInstr.MDIsReturnInstr()) { + // Right after a return come no-ops and 2-byte no-ops + // that are just for alignment. IDA does not seem to be + // happy when we convert all those to code. + if (DebugAddress) msg(" Data followed a return instruction.\n"); + return true; + } + // Now, see if the area from DisasmAddr to NextIDAAddr is all data + // according to IDA. + while (DisasmAddr < NextIDAAddr) { + flags_t DataFlags = getFlags(DisasmAddr); + if (isTail(DataFlags)) { + if (DebugAddress) msg(" tail byte: %x\n", DisasmAddr); + DisasmAddr = get_item_end(DisasmAddr); + } + else if (isData(DataFlags)) { + if (DebugAddress) msg(" data byte: %x\n", DisasmAddr); + DisasmAddr = get_item_end(DisasmAddr); + } + else if (isCode(DataFlags)) { + // How could this ever happen? + if (DebugAddress) msg(" isCode: %x\n", DisasmAddr); + return true; + } + else { // must be isUnknown() + // Very conservative here; only want to convert when the whole + // region is data, because that is a symptom of IDA missing + // a piece of code within a function (usually a piece of code + // that is only reachable via an indirect jump). + if (DebugAddress) msg(" Not isData: %x\n", DisasmAddr); + return true; + } + if (DebugAddress) msg(" new DisasmAddr: %x\n", DisasmAddr); + } // end while (DisasmAddr < NextIDAAddr) + if (DebugAddress) msg(" loop exit CannotConvert: %d\n", CannotConvert); + if (!CannotConvert) { + // Success. + DisasmAddr = DisasmLocs[ShadowDisasmIndex]; + AreaSize = NextIDAAddr - DisasmAddr; + if (DebugAddress) { + msg(" Success! AreaSize: %x Old index: %d new index: %d\n", + AreaSize, ShadowDisasmIndex, DisasmIndex); + msg(" exiting FindDataToConvert()\n"); + msg("\n"); + } + } // end if (!CannotConvert) + return CannotConvert; +} // end of FindDataToConvert() + +// Does a converted code region look like a function prologue? If so, +// we should not include it in the previous function. +bool IsFunctionPrologue(ea_t StartAddr, ea_t EndAddr) { + return false; // **!!** TODO +} // end of IsFunctionPrologue() + +// Patch program bytes that could not be converted from +// data to code, if it can be determined that the bytes represent code +// that IDA has a hard time with. +// Currently limited to finding "call near ptr 0" instructions, which +// often are found in optimized glibc code because gcc was able to +// determine that a function pointer was zero and did constant propagation, +// but unfortunately was not able to determine that the code was unreachable. +// IDA will not succeed in ua_code() for "call 0", but there is no danger +// of a working program ever executing this code. Replacing the call with +// no-ops permits us to continue converting a contiguous range of data to +// code, and permits IDA to reanalyze the function later. +// Returns true if program bytes were patched. +bool MDPatchUnconvertedBytes(ea_t CurrDisasmAddr) { + flags_t AddrFlags = getFlags(CurrDisasmAddr); + if (isData(AddrFlags) || isTail(AddrFlags)) { + // Bytes should have been converted to unknown already. + msg("Cannot patch data bytes or tail bytes at %x\n", CurrDisasmAddr); + return false; + } + SMPInstr PatchInstr(CurrDisasmAddr); + PatchInstr.Analyze(); + int InstrLen = PatchInstr.GetCmd().size; + if (0 >= InstrLen) { + msg("ua_ana0() failed on patch location %x\n", CurrDisasmAddr); + return false; + } + else { + if (PatchInstr.GetCmd().itype != NN_call) { + msg("Cannot patch non-call instruction at %x\n", CurrDisasmAddr); + return false; + } + PatchInstr.PrintOperands(); + op_t CallDest = PatchInstr.GetUse(0); + if ((o_near != CallDest.type) || (0 != CallDest.addr)) { + msg("Cannot patch call unless it is call near ptr 0 at %x", + CurrDisasmAddr); + return false; + } + ea_t PatchAddr = CurrDisasmAddr; + for (int i = 0; i < InstrLen; ++i) { + bool ok = patch_byte(PatchAddr, 0x90); // x86 no-op + if (!ok) { + msg("patch_byte() failed at %x\n", PatchAddr); + return false; + } + ++PatchAddr; + } + msg("Patched %d bytes successfully at %x\n", InstrLen, CurrDisasmAddr); + InstrLen = ua_code(CurrDisasmAddr); + if (0 >= InstrLen) { + msg(" ... but ua_code() still failed!\n"); + return false; + } + } // end if (0 >= InstrLen) ... else ... + return true; +} // end of MDPatchUnconvertedBytes() + +// Create lists of code addresses identified by IDA Pro (in IDAProLocs) +// and an external disassembler (in DisasmLocs). Compare the lists and +// try to convert addresses to code that are found in DisasmLocs but +// not in IDAProLocs. Emit warnings when IDAProLocs has a code address +// not found in DisasmLocs. +void FixCodeIdentification(void) { + size_t DisasmIndex = 0; + ea_t CurrDisasmAddr = DisasmLocs[DisasmIndex++]; + size_t IDAProIndex = 0; + ea_t CurrAddr = IDAProLocs[IDAProIndex++]; + + while (DisasmIndex <= DisasmLocs.size()) { + // If the current address is less than the current + // external disasm address, we have the rare case in + // which IDA Pro has identified an address as code + // but the external disasm has not. Emit a warning + // message and go on to the next IDA address. + if (CurrAddr < CurrDisasmAddr) { + SMPInstr TempInstr(CurrAddr); + TempInstr.Analyze(); + msg("Address %x is code in IDB but not in external disassembler: %s\n", + CurrAddr, TempInstr.GetDisasm()); + if (IDAProIndex < IDAProLocs.size()) + CurrAddr = IDAProLocs[IDAProIndex++]; + else { + // Last IDA addr; might still process Disasm addrs + // after loop exit. + break; + } + } + else if (CurrAddr == CurrDisasmAddr) { + // If equal, no problem, we are moving through the + // code addresses in lockstep. Grab the next address + // from each source. + if (DisasmIndex < DisasmLocs.size()) { + CurrDisasmAddr = DisasmLocs[DisasmIndex++]; + } + else { + ++DisasmIndex; // cause loop exit; skip cleanup loop + } + if (IDAProIndex < IDAProLocs.size()) + CurrAddr = IDAProLocs[IDAProIndex++]; + else { + // Last IDA addr; might still process Disasm addrs + // after loop exit in cleanup loop. + break; + } + } + else { + // We must have CurrAddr > CurrDisasmAddr. That means + // IDA has jumped over some code addresses in + // DisasmLocs. We need to try to convert addresses + // to code until we can reach the current addr. + int InstrLen; + // For now, we will address only the case in which IDA + // has identified addresses as data bytes, and the + // external disassembler(e.g. objdump) has identified + // the same addresses as code. We only want to deal with + // contiguous areas of data-to-code conversion that do NOT + // follow a return statement. + int AreaSize = 0; + ea_t AreaStart = CurrDisasmAddr; + ea_t AreaEnd; +#if SMP_DEBUG_FIXUP_IDB + msg("CurrDisasmAddr: %x CurrAddr: %x\n", CurrDisasmAddr, CurrAddr); +#endif + bool SkipArea = FindDataToConvert(IDAProIndex, DisasmIndex, AreaSize); + + bool DebugAddress = (CurrDisasmAddr == 0x806c19a); + if (SkipArea) { + // Skip over the extra external disasm addresses. + while (CurrDisasmAddr < CurrAddr) + CurrDisasmAddr = DisasmLocs[DisasmIndex++]; + } + else { + // Convert the overlooked code region to unexplored. + AreaEnd = CurrDisasmAddr + AreaSize; +#if SMP_DEBUG_FIXUP_IDB + msg("Found data to convert: %x to %x\n", AreaStart, AreaEnd); +#endif + do_unknown_range(AreaStart, AreaSize, DOUNK_SIMPLE); + bool AllConverted = true; + do { + flags_t InstrFlags = getFlags(CurrDisasmAddr); + if (!isUnknown(InstrFlags)) { + msg("Sync problem in FixCodeID: %x\n", CurrDisasmAddr); + } + else { + InstrLen = ua_code(CurrDisasmAddr); + if (InstrLen > 0) { // Successfully converted to code + SMPInstr NewInstr(CurrDisasmAddr); + NewInstr.Analyze(); +#if SMP_DEBUG_FIXUP_IDB + msg("FixCodeID success at %x: len: %d %s\n", CurrDisasmAddr, + InstrLen, NewInstr.GetDisasm()); +#endif + } + else { + if (MDPatchUnconvertedBytes(CurrDisasmAddr)) { + msg(" Patched bytes at %x\n", CurrDisasmAddr); + } + else { + AllConverted = false; + msg("FixCodeID failure at %x\n", CurrDisasmAddr); + } + } + } // end if (isCode(InstrFlags) ... else ... + if (DisasmIndex < DisasmLocs.size()) { + CurrDisasmAddr = DisasmLocs[DisasmIndex++]; + } + else { + // cause loops to exit + CurrDisasmAddr = CurrAddr; + ++DisasmIndex; // skip cleanup loop + } + } while (CurrDisasmAddr < CurrAddr); + if (AllConverted) { + if (IsFunctionPrologue(AreaStart, AreaEnd)) { + // Create a new function entry chunk here. + // **!!** TODO + ; + } + else { + // Extend the previous chunk to include the + // converted code. + ea_t PrevIDAAddr = IDAProLocs[IDAProIndex - 2]; + func_t *PrevChunk = get_fchunk(PrevIDAAddr); +#if SMP_DEBUG_FIXUP_IDB + msg(" addr in chunk to extend: %x\n", PrevIDAAddr); + msg(" func_t pointer for chunk: %x\n", PrevChunk); +#endif +#if 0 // temporary for debugging + if (is_func_entry(PrevChunk)) { + // Extend the func entry to contain the new code. + if (func_setend(PrevIDAAddr, AreaEnd)) { + msg("Func extended to include code from %x to %x\n", + AreaStart, AreaEnd); + ReanalyzeList.push_back(PrevIDAAddr); + } + else { + msg("Failed to extend func from %x to %x\n", + AreaStart, AreaEnd); + } + } + else { // tail + // See if this works for function tails, also. + // Extend the func entry to contain the new code. + if (func_setend(PrevIDAAddr, AreaEnd)) { + msg("Tail extended to include code from %x to %x\n", + AreaStart, AreaEnd); + func_t *TailOwner = get_func(PrevChunk->owner); + ReanalyzeList.push_back(PrevIDAAddr); + } + else { + msg("Failed to extend tail from %x to %x\n", + AreaStart, AreaEnd); + } + } // end if (is_func_entry()) ... else ... +#endif + } // end if (IsFunctionPrologue()) ... else ... + } // end if (AllConverted) + else { + msg("not AllConverted; cannot include new code in previous chunk.\n"); + } + } // end if (SkipArea) ... else ... + } // end if (addr < CurrDisasmAddr) .. else if ... else ... + } // end while (DisasmIndex <= DisasmLocs.size() + +#if 0 // Make this code use FindDataToConvert() **!!** + // Cleanup loop: + // If there are still Disasm addrs to process, try to turn them + // into code in the IDB. + while (DisasmIndex <= DisasmLocs.size()) { + flags_t InstrFlags = getFlags(CurrDisasmAddr); + if (isCode(InstrFlags)) { + msg("Sync problem in FixCodeID: %x\n", CurrDisasmAddr); + } + else { + // Clear bytes to unexplored. + segment_t *seg = getseg(CurrDisasmAddr); + if (SEG_CODE == seg->type) { + do_unknown_range(CurrDisasmAddr, seg->endEA - CurrDisasmAddr, DOUNK_SIMPLE); + } + else { + // Might be safest to just discontinue processing + // if we wander into a non-code segment. + // DisasmLocs should not have an entire code segment + // that IDA Pro missed. + break; + } + int InstrLen = ua_code(CurrDisasmAddr); + if (InstrLen > 0) { // Successfully converted to code + SMPInstr NewInstr(CurrDisasmAddr); + NewInstr.Analyze(); + msg("FixCodeID success at %x: %s\n", CurrDisasmAddr, + NewInstr.GetDisasm()); + } + else { + msg("FixCodeID failure at %x\n", CurrDisasmAddr); + } + } // end if (isCode(InstrFlags) ... else ... + if (DisasmIndex < DisasmLocs.size()) { + CurrDisasmAddr = DisasmLocs[DisasmIndex++]; + } + else { + ++DisasmIndex; // cause loop to exit + } + } // end while (DisasmIndex <= DisasmLocs.size() +#endif + + return; +} // end of FixCodeIdentification() // Audit the IDA code database by looking at all instructions in the // code segment and printing all those that are not contained in a @@ -339,6 +1186,8 @@ void FindOrphanedCode(segment_t *CurrSeg, FILE *AnnotFile) { for (ea_t addr = CurrSeg->startEA; addr < CurrSeg->endEA; addr = get_item_end(addr)) { flags_t InstrFlags = getFlags(addr); + if (isTail(InstrFlags)) + continue; if (isHead(InstrFlags) && isCode(InstrFlags)) { func_t *CurrFunc = get_func(addr); if (NULL == CurrFunc) { @@ -393,10 +1242,12 @@ void AuditCodeTargets(void) { if ((XrefType == fl_U) || (XrefType == fl_USobsolete)) { msg("Bad xref type: %x %s\n", addr, FuncName); } +#if SMP_DEBUG_FIXUP_IDB else if ((XrefType == fl_JF) || (XrefType == fl_JN)) { msg("Jump to func: %x %s from: %x\n", addr, FuncName, xb.from); } +#endif else if (XrefType == fl_F) { msg("Fall through to func: %x %s from: %x\n", addr, FuncName, xb.from); -- GitLab