//
// SMPStaticAnalyzer.cpp
//
// This plugin performs the static analyses needed for the SMP project
//   (Software Memory Protection).
//

#include <ida.hpp>
#include <idp.hpp>
#include <allins.hpp>
#include <auto.hpp>
#include <bytes.hpp>
#include <funcs.hpp>
#include <intel.hpp>
#include <loader.hpp>
#include <lines.hpp>
#include <name.hpp>
#include <ua.hpp>

#include "SMPStaticAnalyzer.h"
#include "SMPDataFlowAnalysis.h"
#include "SMPFunction.h"
#include "SMPInstr.h"
#include "ProfilerInformation.h"


// Compile-time switches for diagnostic output. Set a switch to 1 to enable it.
#define SMP_DEBUG 1
#define SMP_DEBUG3 0   // verbose
#define SMP_DEBUG_MEM 0 // print memory operands
#define SMP_DEBUG_TYPE0 0 // Output instr info for OptType = 0
#define SMP_DEBUG_CHUNKS 0 // restructuring tail chunks, shared chunks, etc.
#define SMP_DEBUG_DATA_ONLY 0  // Find & fix data addresses in code segments

// Set to 1 when doing a binary search using SMP_DEBUG_COUNT to find
//  which function is causing a problem.
#define SMP_BINARY_DEBUG 0
#define SMP_DEBUG_COUNT 356  // How many funcs to process in problem search
int FuncsProcessed = 0;

// Switches that control the IDA database (IDB) repair passes.
#define SMP_FIXUP_IDB 1  // Try to fix the IDA database?
#define SMP_DEBUG_FIXUP_IDB 0  // debugging output for FixupIDB chain
#define SMP_FIND_ORPHANS 1  // find code outside of functions

// Define optimization categories for instructions.
//  The table has NN_last + 1 entries; presumably it is indexed by the IDA
//  instruction type code (NN_*) -- confirm against InitOptCategory().
int OptCategory[NN_last + 1];
// Initialize the OptCategory[] array.
void InitOptCategory(void);

// Keep statistics on how many instructions we saw in each optimization
//  category, and how many optimizing annotations were emitted for
//  each category. Both tables have one slot per category value in
//  the range 0..LAST_OPT_CATEGORY.
int OptCount[LAST_OPT_CATEGORY + 1];
int AnnotationCount[LAST_OPT_CATEGORY + 1];

// Unique data referent number to use in data annotations.
unsigned long DataReferentID;

// The types of data objects based on their first operand flags.
//  The final entry is a 0 (null pointer) that terminates the list.
const char *DataTypes[] = { "VOID", "NUMHEX", "NUMDEC", "CHAR",
		"SEG", "OFFSET", "NUMBIN", "NUMOCT", "ENUM", "FORCED", 
		"STRUCTOFFSET", "STACKVAR", "NUMFLOAT", "UNKNOWN", 
		"UNKNOWN", "UNKNOWN", 0};

// Filename (not including path) of executable being analyzed.
//  Filled in by the get_root_filename() call in IDAP_run().
static char RootFileName[MAXSTR];

clc5q's avatar
clc5q committed
// Operand type that can have all fields initialized to o_void and zero
//  values, to be used to copy-initialize operands that we are adding to
//  RTLs and DEF and USE lists.
op_t InitOp; 

// Code addresses identified by a disassembler, such as objdump on
//  Linux. These can be used to improve the code vs. data identification
//  of IDA Pro. Filled in by FindCodeAddresses(); the binary searches in
//  this file rely on the entries being in ascending address order.
vector<ea_t> DisasmLocs;
// Code addresses as identified by IDA Pro, to be compared to DisasmLocs.
//  Also filled in by FindCodeAddresses().
vector<ea_t> IDAProLocs;

// Function start and end addresses (for function entry chunks only).
//  Kept here because IDA Pro 5.1 seems to have a memory overwriting
//  problem when iterating through all functions in the program. An existing
//  func_t *ChunkInfo data structure was getting overwritten by one of the 
//  function func_t data structures, causing changes of startEA and endEA among
//  other things.
struct SMP_bounds_t {
	ea_t startEA;  // first address of the range
	ea_t endEA;    // end address of the range, as copied from func_t::endEA
};

// One entry per function, recorded in getn_func() order by IDAP_run().
vector<SMP_bounds_t> FuncBounds;

// List of functions that need to be reanalyzed after all the code fixup
//  and code discovery is complete. Kept as a list of addresses; any address
//  within the function is good enough to designate it.
list<ea_t> FuncReanalyzeList;

// A code region that has been converted from data but has code addresses that
//  need to be reanalyzed. This is usually because a former data address is
//  now a jump to a code target that is still a data address. We have to wait
//  until the target has become code before IDA will accept the jump as valid.
class FixupRegion {
public:
	FixupRegion(SMP_bounds_t);
	inline ea_t GetStart(void) const { return CodeRegion.startEA; };
	inline ea_t GetEnd(void) const { return CodeRegion.endEA; };
	inline void SetStart(ea_t addr) { CodeRegion.startEA = addr; };
	list<ea_t> FixupInstrs; // easier to expose than to encapsulate
private:
	SMP_bounds_t CodeRegion;
};

// Build a FixupRegion whose bounds are a copy of Range. The list of
//  fixup instructions starts out empty.
FixupRegion::FixupRegion(SMP_bounds_t Range) : CodeRegion(Range) {
}

// List of code regions that were not completely analysed because of jump to
//  data considerations.
list<FixupRegion> CodeReanalyzeList;

// Functions for diagnosing and/or fixing problems in the IDA database.
// NOTE(review): FixupIDB() also calls FixCodeIdentification() and
//  FixupNewCodeChunks(), which have no prototype in this list; presumably
//  they are declared in one of the included SMP headers -- confirm.
void FixupIDB(void);  // Driver for all other fixing functions.
void FindDataInCode(void);  // Convert isolated/misaligned code inside data runs to data.
void AuditTailChunkOwnership(void);  // Re-assign owners of function TAIL chunks.
void FindOrphanedCode(segment_t *, FILE *);  // Called once per code segment by IDAP_run().
void AuditCodeTargets(void);
ea_t FindNewFuncLimit(ea_t);
void SpecialDebugOutput(void);  // Only called when SMP_DEBUG_FIXUP_IDB is enabled.

// Notification hook installed on HT_IDP by IDAP_init(). Every event except
//  ph.auto_empty_finally is ignored. When that event arrives, the plugin
//  body is run and qexit(0) is called.
// Always returns 0.
static int idaapi idp_callback(void *, int event_id, va_list va) {
	const bool AnalysisDone = (event_id == ph.auto_empty_finally);
	if (!AnalysisDone)
		return 0;

	// IDA analysis is done
	IDAP_run(0);
	qexit(0);
	return 0;
}

int IDAP_init(void) {
#if 0 // We are now calling from the SMP.idc script.
	// Skip this plugin if it was not specified by the user on the
	//  command line.
	if (get_plugin_options("SMPStaticAnalyzer") == NULL) {
		msg("IDAP_init point 2.\n");
		return PLUGIN_SKIP;
	}
#endif
	// Ensure correct working environment.
	if ((inf.filetype != f_ELF) && (inf.filetype != f_PE)) {
		error("Executable format must be PE or ELF.");
		return PLUGIN_SKIP;
	}
	if (ph.id != PLFM_386) {
		error("Processor must be x86.");
 		return PLUGIN_SKIP;
	}
	hook_to_notification_point(HT_IDP, idp_callback, NULL);
    InitOptCategory();
	InitDFACategory();
	InitTypeCategory();
	InitSMPDefsFlags();
	InitSMPUsesFlags();
clc5q's avatar
clc5q committed
	InitOp.type = o_void;
	InitOp.addr = 0;
	InitOp.dtyp = dt_dword;
	InitOp.flags = 0;
	InitOp.n = 0;
	InitOp.offb = 0;
	InitOp.offo = 0;
	InitOp.reg = R_none;
	InitOp.specflag1 = 0;
	InitOp.specflag2 = 0;
	InitOp.specflag3 = 0;
	InitOp.specflag4 = 0;
	InitOp.specval = 0;
	InitOp.value = 0;
	return PLUGIN_KEEP;
} // end of IDAP_init

// Plugin termination entry point: removes the HT_IDP notification hook
//  (idp_callback) that IDAP_init() installed.
void IDAP_term(void) {
	unhook_from_notification_point(HT_IDP, idp_callback, NULL);
}

// Main body of the plugin; invoked from idp_callback() with arg == 0.
//  Visible steps: derive the output file name (<root filename>.annot),
//  create the SMPProgram and ProfilerInformation objects, open the output
//  file, clear the statistics tables, record all function bounds in
//  FuncBounds, optionally fix up the IDB, infer data granularity from the
//  profiler information, look for orphaned code in every code segment, and
//  print the per-category statistics.
// NOTE(review): this copy of the function is damaged. It contains stray
//  non-code lines ("clc5q's avatar" / "clc5q committed") and at least two
//  places where source lines are evidently missing (marked below). It will
//  not compile until those are restored from the repository.
// NOTE(review): arg is not used in the visible code, and CurrProg and
//  prof_info are never deleted in the visible code.
void IDAP_run(int arg) {
	segment_t *seg;
	FILE *SymsFile;
#if SMP_DEBUG
	msg("Beginning IDAP_run.\n");
#endif
	// Build the output file name: root file name plus ".annot".
	ssize_t FileLen;
	FileLen = get_root_filename(RootFileName, sizeof(RootFileName) - 1);
	string SymsFileName(RootFileName);
	string FileSuffix(".annot");
	SymsFileName += FileSuffix;
	SMPProgram *CurrProg = new SMPProgram();
	CurrProg->AnalyzeData(); // Analyze static data in the executable

	// Read the profiler-generated information into a new prof_info object.
	//  Note that it is constructed from the same file name that is opened
	//  for writing on the next line.
	ProfilerInformation *prof_info = new ProfilerInformation(SymsFileName.c_str(), CurrProg);
	SymsFile = qfopen(SymsFileName.c_str(), "w");
	// NOTE(review): the test that opens this error block (presumably
	//  "if (NULL == SymsFile) {") is missing here.
clc5q's avatar
clc5q committed
		error("FATAL ERROR: Cannot open output file %s\n", SymsFileName.c_str());
		return;
	}

	// Start the statistics from zero.
	(void) memset(OptCount, 0, sizeof(OptCount));
	(void) memset(AnnotationCount, 0, sizeof(AnnotationCount));

	// Record the start and end addresses for all function entry
	//  chunks in the program.
	FuncBounds.reserve(10 + get_func_qty());
	for (size_t FuncIndex = 0; FuncIndex < get_func_qty(); ++FuncIndex) {
		func_t *FuncInfo = getn_func(FuncIndex);
		SMP_bounds_t temp;
		temp.startEA = FuncInfo->startEA;
		temp.endEA = FuncInfo->endEA;
		FuncBounds.push_back(temp);
	}

#if SMP_DEBUG_DATA_ONLY
	// Debug mode: only run the data-in-code passes, then stop.
	FindDataInCode();
	FixCodeIdentification();
	qfclose(SymsFile);
	return;
#endif

	// Pre-audit the IDA database by seeing if the distinction
	//  between code and data can be improved, and if all branches
	//  and calls have proper code targets and code cross references.
#if SMP_FIXUP_IDB
	FixupIDB();
#endif
	msg("Calling InferDataGranularity\n");
	msg("ptr to MemoryAccessInfo: %x\n", prof_info->GetMemoryAccessInfo());
	prof_info->GetMemoryAccessInfo()->InferDataGranularity();
	msg("Returned from InferDataGranularity\n");
	// NOTE(review): source lines may be missing at this point as well;
	//  nothing visible analyzes functions or emits annotations.
clc5q's avatar
clc5q committed
#if SMP_FIND_ORPHANS
	// Look for code outside of functions in every code segment.
	for (int SegIndex = 0; SegIndex < get_segm_qty(); ++SegIndex) {
		seg = getnseg(SegIndex);
		if (seg->type == SEG_CODE)
			FindOrphanedCode(seg, SymsFile);
	// NOTE(review): the closing brace of the loop above and the matching
	//  #endif are missing here.
	for (int OptType = 0; OptType <= LAST_OPT_CATEGORY; ++OptType) {
		msg("Optimization Category Count %d:  %d Annotations: %d\n",
			OptType, OptCount[OptType], AnnotationCount[OptType]);
	}

	qfclose(SymsFile);
	return;
} // end IDAP_run()

// Descriptive strings referenced by the PLUGIN descriptor below.
char IDAP_comment[] = "UVa SMP/NICECAP Project";
char IDAP_help[] = "Good luck";
char IDAP_name[] = "SMPStaticAnalyzer";
char IDAP_hotkey[] = "Alt-J";

// Plugin descriptor that ties together the entry points and strings
//  defined in this file.
// NOTE(review): the initializers are positional; their order presumably
//  follows the plugin_t declaration in loader.hpp -- confirm against the SDK.
plugin_t PLUGIN = {
	IDP_INTERFACE_VERSION,
	0,
	IDAP_init,
	IDAP_term,
	IDAP_run,
	IDAP_comment,
	IDAP_help,
	IDAP_name,
	IDAP_hotkey
};

// Find all code addresses in the IDA database and enter them into
//  IDAProLocs. Find all code addresses identified by the external
//  disassembler (e.g. objdump) and enter them into DisasmLocs.
// The external addresses are read from the text file
//  <RootFileName>.SMPobjdump: each line must begin with a hexadecimal
//  address, and the rest of the line is ignored. If the file cannot be
//  opened, or no address can be read from it, the function returns
//  without scanning the IDA database.
void FindCodeAddresses(void) {
	// Read in code addresses as found by an external disassembler.
	ea_t CurrDisasmAddr;
	string DisasmFileName(RootFileName);
	string FileSuffix(".SMPobjdump");
	DisasmFileName += FileSuffix;
	FILE *DisasmFile = qfopen(DisasmFileName.c_str(), "r");
	if (NULL == DisasmFile) {
		error("FATAL: Cannot open input file %s\n", DisasmFileName.c_str());
		return;
	}

#define DISASM_RESERVE_SIZE  50000
	DisasmLocs.reserve(DISASM_RESERVE_SIZE);
	int ScanReturn = qfscanf(DisasmFile, "%x", &CurrDisasmAddr);
	while (1 == ScanReturn) {
		int NextChar;
		DisasmLocs.push_back(CurrDisasmAddr);
		// Swallow the rest of the input line and get the next address.
		do {
			NextChar = qfgetc(DisasmFile);
		} while ((EOF != NextChar) && ('\n' != NextChar));
		ScanReturn = qfscanf(DisasmFile, "%x", &CurrDisasmAddr);
	} // end while (1 == ScanReturn)

	// The file is no longer needed, whether or not anything was read.
	qfclose(DisasmFile);
	if (DisasmLocs.empty()) {
		msg("ERROR: No addresses read from %s\n", DisasmFileName.c_str());
		return;
	}
	// size() is a size_t; convert it explicitly to match the %d format.
	msg("%d Disasm addresses read from %s\n", (int) DisasmLocs.size(),
		DisasmFileName.c_str());

	// Find all the code locs in the IDA Pro database. As we find
	//  them, store them in IDAProLocs.
	for (int SegIndex = 0; SegIndex < get_segm_qty(); ++SegIndex) {
		segment_t *seg = getnseg(SegIndex);
		if ((NULL == seg) || (SEG_CODE != seg->type))
			continue;

		for (ea_t addr = seg->startEA; addr < seg->endEA; addr = get_item_end(addr)) {
			flags_t InstrFlags = getFlags(addr);
			if (isHead(InstrFlags) && isCode(InstrFlags)) {
				IDAProLocs.push_back(addr);
#if 0
				if ((0x806cda4 <= addr) && (0x806cf99 >= addr))
					msg("IDA code addr: %x\n", addr);
#endif
			} // end if (isHead(addr) && isCode(addr)
#if SMP_DEBUG_FIXUP_IDB
			else if ((0x806cda4 <= addr) && (0x806cf99 >= addr)) {
				if (!isHead(InstrFlags))
					msg("Weirdness: not isHead at %x\n", addr);
				if (isUnknown(InstrFlags)) {
					msg("Weirdness: isUnknown at %x\n", addr);
				}
			}
#endif
		} // end for (ea_t addr = seg->startEA; ...)
	} // end for (int SegIndex = 0; ...)
	return;
} // end FindCodeAddresses()

// Return true if addr is not a proper beginning address for an instruction.
// Return false otherwise.
// Currently, we claim that an instruction is misaligned if DisasmLocs does
//  not contain it. This function is useful for dealing with errors in IDA
//  code identification, in which a large code section is identified as data,
//  but some instructions in the middle of the "data" are identified as
//  code but IDA often starts on the wrong boundary in these cases.
// An empty DisasmLocs contains no address at all, so every addr is
//  reported as misaligned in that case.
bool IsCodeMisaligned(ea_t addr) {
	// Without this guard, the search below would read DisasmLocs[0] and
	//  compute (max - 1) with max == 0 on an empty vector.
	if (DisasmLocs.empty())
		return true;

	// Do a binary search for addr within DisasmLocs, which is sorted
	//  in ascending address order because of the way in which it was
	//  generated. The live search window is [min, max).
	size_t min = 0;
	size_t max = DisasmLocs.size();  // don't access DisasmLocs[max]
	size_t index = (min + max) / 2;

	while (addr != DisasmLocs[index]) {
		// Window is down to one element and it did not match.
		if (min >= (max - 1))
			return true;
#if 0
		msg("min: %d max: %d index: %d\n", min, max, index);
#endif
		if (addr < DisasmLocs[index])
			max = index;
		else // must be addr > DisasmLocs[index];
			min = index;

		index = (min + max) / 2;
	}

	return false;
} // end of IsCodeMisaligned()

// Remove addr from IDAProLocs if it is present; do nothing otherwise
//  (including when IDAProLocs is empty).
void RemoveIDACodeAddr(ea_t addr) {
	// Without this guard, the search below would read IDAProLocs[0] and
	//  compute (max - 1) with max == 0 on an empty vector.
	if (IDAProLocs.empty())
		return;

	// Do a binary search for addr within IDAProLocs, which is sorted
	//  in ascending address order because of the way in which it was
	//  generated. Delete the element of IDAProLocs if found.
	size_t min = 0;
	size_t max = IDAProLocs.size();  // don't access IDAProLocs[max]
	size_t index = (min + max) / 2;

	while (addr != IDAProLocs[index]) {
		// Window is down to one element and it did not match.
		if (min >= (max - 1))
			return;
#if 0
		msg("min: %d max: %d index: %d\n", min, max, index);
#endif
		if (addr < IDAProLocs[index])
			max = index;
		else // must be addr > IDAProLocs[index];
			min = index;

		index = (min + max) / 2;
	}

	// IDAProLocs[index] contains addr.
	IDAProLocs.erase(IDAProLocs.begin() + index);
	return;
} // end of RemoveIDACodeAddr()

// Driver for all other fixing functions. Upon its return, the IDA
//  database (IDB file) should be fixed up as much as we can fix it.
// The passes run in a fixed order: FindCodeAddresses() must come first
//  because it fills DisasmLocs and IDAProLocs, which FindDataInCode()
//  and FixCodeIdentification() read.
void FixupIDB(void) {
	FindCodeAddresses();
#if SMP_DEBUG_FIXUP_IDB
	SpecialDebugOutput();
#endif
	AuditCodeTargets();
	FindDataInCode();
	AuditTailChunkOwnership();
	FixCodeIdentification();
	// The value returned by FixupNewCodeChunks() is not needed here.
	(void) FixupNewCodeChunks();
#if SMP_DEBUG_FIXUP_IDB
	SpecialDebugOutput();
#endif
} // end of FixupIDB()

// Find and print all data head addresses in code segments. 
// If an isolated code instruction is found in the midst of a run
//  of data bytes and has no code xrefs jumping to it, it is not
//  reachable as code and is undoubtedly a mixup by IDA. Possibly
//  the whole data region will be converted to code later, in which
//  case the isolated code is not necessarily properly aligned and
//  parsed at its present address, so we are glad to convert it into
//  data anyway so that FindDataToConvert() will succeed on it later.
// Data to code conversion, and isolated code detection, are inhibited
//  by IDA identifying several consecutive instructions in the midst
//  of a data region, with the code addresses not agreeing with the
//  external disassembler's code addresses. We will convert these
//  misaligned instructions to data as we detect them. We will also
//  convert unexplored bytes (isUnknown(flags) == true) into data if
//  they are in the midst of a data sequence.
#define MIN_DATARUN_LEN 24  // #bytes on either side of "isolated" code
void FindDataInCode(void) {
	size_t DataRunLen = 0; // How many data bytes in a row have we seen?
	bool IsolatedCodeTrigger = false; // Have seen data, then isolated code
									// Now looking for data
	ea_t IsolatedCodeAddr;
	int IsolatedCodeLen;
	int InstrLen;

	for (int SegIndex = 0; SegIndex < get_segm_qty(); ++SegIndex) {
		char SegName[MAXSTR];
		segment_t *seg = getnseg(SegIndex);
		ssize_t SegNameSize = get_segm_name(seg, SegName, sizeof(SegName) - 1);
		if (SEG_CODE != seg->type)
			continue;
#if SMP_DEBUG_FIXUP_IDB
		msg("Non-code addresses for code segment %s from %x to %x\n",
			SegName, seg->startEA, seg->endEA);
#endif
		for (ea_t addr = seg->startEA; addr < seg->endEA; addr = get_item_end(addr)) {
			flags_t AddrFlags = getFlags(addr);
			if (isHead(AddrFlags)) {
				if (isData(AddrFlags)) {
					DataRunLen += get_item_size(addr);
#if SMP_DEBUG_FIXUP_IDB
					msg("Data: %x\n", addr);
#endif
					if (MIN_DATARUN_LEN <= DataRunLen) {
						if (IsolatedCodeTrigger) {
							// Saw data, then one isolated code, then data
							do_unknown_range(IsolatedCodeAddr, IsolatedCodeLen, DOUNK_SIMPLE);
							RemoveIDACodeAddr(IsolatedCodeAddr);
							if (do_data_ex(IsolatedCodeAddr, byteflag(),
								IsolatedCodeLen, BADNODE)) {
									msg("Converted isolated code to data: %x\n",
										IsolatedCodeAddr);
							}
							else {
								msg("Failed to convert isolated code to data: %x len: %x\n",
									IsolatedCodeAddr, IsolatedCodeLen);
							}
							IsolatedCodeTrigger = false;
						} // end if (IsolatedCodeTrigger)
					} // end if (MIN_DATARUN_LEN <= DataRunLen)
				} // end if (isData(AddrFlags)
				else if (isUnknown(AddrFlags)) {
					// Just in case; unknown usually means not head or tail
					// If in a data run, convert to data.
					InstrLen = get_item_size(addr);
clc5q's avatar
clc5q committed
#if SMP_DEBUG_FIXUP_IDB
					msg("Unknown: %x len: %x\n", addr, InstrLen);
clc5q's avatar
clc5q committed
#endif
					if (0 < DataRunLen) {
						if (do_data_ex(addr, byteflag(), InstrLen, BADNODE)) {
clc5q's avatar
clc5q committed
#if SMP_DEBUG_FIXUP_IDB
							msg("Converted unknown to data at %x len: %x\n", addr, InstrLen);
clc5q's avatar
clc5q committed
#endif
clc5q's avatar
clc5q committed
#if SMP_DEBUG_FIXUP_IDB
							msg("Failed to convert unknown to data at %x len: %x\n", addr, InstrLen);
clc5q's avatar
clc5q committed
#endif
							DataRunLen = 0;
							IsolatedCodeTrigger = false;
						}
					}
				}
				else if (isCode(AddrFlags)) {  // must be true
					if (MIN_DATARUN_LEN <= DataRunLen) {
clc5q's avatar
clc5q committed
#if SMP_DEBUG_FIXUP_IDB
						msg("DataRunLen: %d at %x\n", DataRunLen, addr);
clc5q's avatar
clc5q committed
#endif
						InstrLen = ua_ana0(addr);
#if SMP_DEBUG_FIXUP_IDB
						msg("Calling IsCodeMisaligned: len %d\n", InstrLen);
#endif
						if (IsCodeMisaligned(addr)) {
#if SMP_DEBUG_FIXUP_IDB
							msg("Code was misaligned.\n");
#endif
							do_unknown_range(addr, InstrLen, DOUNK_SIMPLE);
							RemoveIDACodeAddr(addr);
							if (do_data_ex(addr, byteflag(), InstrLen, BADNODE)) {
clc5q's avatar
clc5q committed
#if SMP_DEBUG_FIXUP_IDB
								msg("Converted misaligned code to data at %x : len: %x\n",
									addr, InstrLen);
clc5q's avatar
clc5q committed
#endif
								// Step back so data gets processed.
								DataRunLen += get_item_size(addr);
								continue; // skip reset of DataRunLen
							}
							else {
clc5q's avatar
clc5q committed
#if SMP_DEBUG_FIXUP_IDB
								msg("Misaligned code left as unknown at %x : len: %x\n",
									addr, InstrLen);
clc5q's avatar
clc5q committed
#endif
								IsolatedCodeTrigger = false;
							}
						} // end if (IsCodeMisaligned() ...)
						else if (!hasRef(AddrFlags)) {
							// No references at all --> isolated code.
							IsolatedCodeTrigger = true;
							IsolatedCodeAddr = addr;
							IsolatedCodeLen = InstrLen;
						}
						else {
							xrefblk_t xb;
							bool ok = xb.first_to(IsolatedCodeAddr, XREF_ALL);
							if (!ok) {
								// No code xrefs to this target addr.
								IsolatedCodeTrigger = true;
								IsolatedCodeAddr = addr;
								IsolatedCodeLen = InstrLen;
							}
						}
					} // end if (MIN_DATARUN_LEN <= DataRunLen)
					else if (IsolatedCodeTrigger) {
						// Two instructions in a row does not fit the pattern.
						IsolatedCodeTrigger = false;
					}
					DataRunLen = 0;
				} // end if (isData) ... else if (isUnknown) ... else isCode
			} // end if (isHead)
			else if (isUnknown(AddrFlags)) {
				// If in a data run, convert to data.
				InstrLen = get_item_size(addr);
clc5q's avatar
clc5q committed
#if SMP_DEBUG_FIXUP_IDB
				msg("Unknown: %x len: %x\n", addr, InstrLen);
clc5q's avatar
clc5q committed
#endif
				if (0 < DataRunLen) {
					if (do_data_ex(addr, byteflag(), InstrLen, BADNODE)) {
clc5q's avatar
clc5q committed
#if SMP_DEBUG_FIXUP_IDB
						msg("Converted unknown to data at %x len: %x\n", addr, InstrLen);
clc5q's avatar
clc5q committed
#endif
clc5q's avatar
clc5q committed
#if SMP_DEBUG_FIXUP_IDB
						msg("Failed to convert unknown to data at %x len: %x\n", addr, InstrLen);
clc5q's avatar
clc5q committed
#endif
						DataRunLen = 0;
						IsolatedCodeTrigger = false;
					}
				}
			}
		} // end for (ea_t addr =  seg->startEA; ...)
	} // end for (int SegIndex = 0; ...)
	return;
} // end of FindDataInCode()


// The choices that IDA makes for deciding which parent function of a
//  TAIL chunk is the primary owner of the tail can be counterintuitive.
//  A function entry can both fall into and jump to a tail chunk that
//  is contiguous with it, yet the "owner" might be a function that is
//  far below it in the executable address space. This function will
//  change the ownership to a more sensible arrangement.
// For every TAIL chunk, the preferred owner is the parent with the
//  highest address that is still below the start of the tail. When no
//  parent lies below the tail, the function only looks up (and, with
//  SMP_DEBUG_CHUNKS, reports) the nearest preceding function entry from
//  FuncBounds; ownership is not changed in that case.
// Lookups that can fail (getn_fchunk, get_func) are checked for NULL
//  before the result is used.
void AuditTailChunkOwnership(void) {
	char FuncName[MAXSTR];
	// Iterate through all chunks in the program.
	for (size_t ChunkIndex = 0; ChunkIndex < get_fchunk_qty(); ++ChunkIndex) {
		func_t *ChunkInfo = getn_fchunk((int) ChunkIndex);
		if ((NULL == ChunkInfo) || !is_func_tail(ChunkInfo))
			continue;  // only TAIL chunks are audited

		// For each TAIL chunk, find all the parent chunks. Find the last
		//  parent chunk with an address less than the TAIL chunk address.
		ea_t BestCandidate = 0;
		func_parent_iterator_t FuncParent(ChunkInfo);
#if SMP_DEBUG_CHUNKS
		msg("Tail chunk: %x ", ChunkInfo->startEA);
#endif
		for (bool ok = FuncParent.first(); ok; ok = FuncParent.next()) {
			ea_t parent = FuncParent.parent();
#if SMP_DEBUG_CHUNKS
			msg(" parent: %x ", parent);
#endif
			if ((parent > BestCandidate) && (parent < ChunkInfo->startEA))
				BestCandidate = parent;
		}
#if SMP_DEBUG_CHUNKS
		msg("\n");
#endif
		//  Make the best parent chunk the owner of the TAIL chunk if it is
		//  not already the owner.
		if (ChunkInfo->owner != BestCandidate) {
			if (0 < BestCandidate) {
				if (set_tail_owner(ChunkInfo, BestCandidate)) {
					func_t *FuncInfo = get_func(BestCandidate);
					msg("Set %x as new owner of tail %x\n",
						BestCandidate, ChunkInfo->startEA);
					// Reanalyze the parent function (and all its
					//  tail chunks) now that the structure has changed.
					if (NULL != FuncInfo)
						reanalyze_function(FuncInfo);
				}
				else {
					msg("set_tail_owner failed for tail %x and parent %x\n",
						ChunkInfo->startEA, BestCandidate);
				}
			}
			else {
				func_t *FuncInfo = get_func(ChunkInfo->owner);
				FuncName[0] = '\0';
				if (NULL != FuncInfo)
					get_func_name(FuncInfo->startEA, FuncName, sizeof(FuncName) - 1);
#if SMP_DEBUG_CHUNKS
				msg("No good parent candidate before tail at %x\n",
					ChunkInfo->startEA);
				if (NULL != FuncInfo)
					msg("Current parent is %x: %s\n", FuncInfo->startEA, FuncName);
#endif
				// Find out if a function entry chunk that comes before the
				//  tail is a better candidate for the owner (i.e. it falls
				//  through to the tail, or jumps to it).
				BestCandidate = 0;
#if SMP_DEBUG_CHUNKS
				msg("Finding parent func candidates for %x:", ChunkInfo->startEA);
#endif
				SMP_bounds_t CurrFunc;
				for (size_t FuncIndex = 0; FuncIndex < FuncBounds.size(); ++FuncIndex) {
					CurrFunc = FuncBounds[FuncIndex];
					if ((CurrFunc.startEA < ChunkInfo->startEA)
						&& (CurrFunc.startEA > BestCandidate)) {
						BestCandidate = CurrFunc.startEA;
#if SMP_DEBUG_CHUNKS
						msg(" candidate: %x tail: %x", BestCandidate,
							ChunkInfo->startEA);
#endif
					}
					else {
						// Stop at the first entry that does not improve on
						//  the current candidate.
#if SMP_DEBUG_CHUNKS
						msg(" not a candidate: %x tail: %x best: %x\n",
							CurrFunc.startEA, ChunkInfo->startEA, BestCandidate);
#endif
						break;
					}
				} // end for (size_t FuncIndex = 0; ...)
				if (0 >= BestCandidate) { // highly unlikely
					msg("No good func entry parent candidate.\n");
				}
				else {
					FuncInfo = get_func(BestCandidate);
					if (NULL != FuncInfo) {
						get_func_name(FuncInfo->startEA, FuncName, sizeof(FuncName) - 1);
#if SMP_DEBUG_CHUNKS
						msg("Best func entry parent candidate: %s at %x",
							FuncName, BestCandidate);
						if (FuncInfo->endEA == ChunkInfo->startEA)
							msg(" Function endEA == tail chunk startEA");
						msg("\n");
#endif
					}
				}
			}
		} // end if (ChunkInfo->owner != BestCandidate)
#if SMP_DEBUG_CHUNKS
		else {
			msg("Already best parent for %x is %x\n", ChunkInfo->startEA,
				ChunkInfo->owner);
		}
#endif
	} // end for (size_t ChunkIndex = 0; ...)

	return;
} // end of AuditTailChunkOwnership()

// If the addresses signified from DisasmIndex to IDAProIndex are
//  all considered data and do NOT follow a return instruction,
//  return false and update AreaSize to reflect the area to be
//  converted.
// Both indices are "one past" values: the entries examined are
//  DisasmLocs[DisasmIndex - 1] and IDAProLocs[IDAProIndex - 1], plus
//  IDAProLocs[IDAProIndex - 2] as the preceding IDA code address.
// Return value: true -> skip to IDAProIndex; false -> convert AreaSize bytes.
//  AreaSize is only written when false is returned.
bool FindDataToConvert(size_t IDAProIndex, size_t DisasmIndex, int &AreaSize) {
	// Reject index values for which DisasmLocs[DisasmIndex - 1] does not
	//  exist, before that element is read.
	if ((0 == DisasmIndex) || (DisasmIndex > DisasmLocs.size()))
		return true;

	ea_t PrevIDAAddr;
	ea_t NextIDAAddr;
	const size_t ShadowDisasmIndex = DisasmIndex - 1;
	ea_t DisasmAddr = DisasmLocs[ShadowDisasmIndex];
	const bool CannotConvert = false;  // return value on success
	bool DebugAddress = false;
#if SMP_DEBUG_FIXUP_IDB
	DebugAddress = (DisasmAddr == 0x806c19a);
#endif

	if (DebugAddress) {
		msg("IDAProIndex: %d DisasmIndex: %d\n", (int) IDAProIndex, (int) DisasmIndex);
		msg("IDA locs size %d Disasm locs size %d\n", (int) IDAProLocs.size(),
			(int) DisasmLocs.size());
	}
	if (IDAProIndex >= IDAProLocs.size()) {
		// Have already processed the last IDA address.
		if (DebugAddress) msg(" Already done with IDAProLocs.\n");
		return true;
	}
	else if (DisasmIndex >= DisasmLocs.size()) {
		// Strange. Last Disasm address is only one to convert, and
		//  IDA still has addresses after that?
		if (DebugAddress) msg(" Already done with DisasmLocs.\n");
		return true;
	}
	else if (IDAProIndex < 2) {
		// We have Disasm addrs before the very first IDA addr. We
		//  don't trust this boundary case.
		if (DebugAddress) msg(" Boundary case with IDAProLocs.\n");
		return true;
	}
	NextIDAAddr = IDAProLocs[IDAProIndex - 1];
	PrevIDAAddr = IDAProLocs[IDAProIndex - 2];
	if (DebugAddress) msg(" PrevIDAAddr: %x NextIDAAddr: %x\n", PrevIDAAddr, NextIDAAddr);

	// See if previous IDA address was a return.
	flags_t PrevFlags = getFlags(PrevIDAAddr);
	if (!isCode(PrevFlags) || !isHead(PrevFlags)) {
		msg("PrevIDAAddr %x not isCode or not isHead.\n", PrevIDAAddr);
		return true;
	}
	SMPInstr PrevInstr(PrevIDAAddr);
	PrevInstr.Analyze();
	if (DebugAddress) msg("Finished PrevInstr.Analyze()\n");
	if (PrevInstr.MDIsReturnInstr()) {
		// Right after a return come no-ops and 2-byte no-ops
		//  that are just for alignment. IDA does not seem to be
		//  happy when we convert all those to code.
		if (DebugAddress) msg(" Data followed a return instruction.\n");
		return true;
	}
	// Now, see if the area from DisasmAddr to NextIDAAddr is all data
	//  according to IDA.
	while (DisasmAddr < NextIDAAddr) {
		flags_t DataFlags = getFlags(DisasmAddr);
		if (isTail(DataFlags)) {
			if (DebugAddress) msg(" tail byte: %x\n", DisasmAddr);
			DisasmAddr = get_item_end(DisasmAddr);
		}
		else if (isData(DataFlags)) {
			if (DebugAddress) msg(" data byte: %x\n", DisasmAddr);
			DisasmAddr = get_item_end(DisasmAddr);
		}
		else if (isCode(DataFlags)) {
			// How could this ever happen?
			if (DebugAddress) msg(" isCode: %x\n", DisasmAddr);
			return true;
		}
		else { // must be isUnknown()
			// Very conservative here; only want to convert when the whole
			//  region is data, because that is a symptom of IDA missing
			//  a piece of code within a function (usually a piece of code
			//  that is only reachable via an indirect jump).
			if (DebugAddress) msg(" Not isData: %x\n", DisasmAddr);
			return true;
		}
		if (DebugAddress) msg(" new DisasmAddr: %x\n", DisasmAddr);
	} // end while (DisasmAddr < NextIDAAddr)
	if (DebugAddress) msg(" loop exit CannotConvert: %d\n", CannotConvert);

	// Success: every failure case returned early above.
	DisasmAddr = DisasmLocs[ShadowDisasmIndex];
	AreaSize = NextIDAAddr - DisasmAddr;
	if (DebugAddress) { 
		msg(" Success! AreaSize: %x Old index: %d new index: %d\n",
			AreaSize, (int) ShadowDisasmIndex, (int) DisasmIndex);
		msg(" exiting FindDataToConvert()\n");
		msg("\n");
	}
	return CannotConvert;
} // end of FindDataToConvert()

// Intended to decide whether a freshly converted code region looks like
//  a function prologue, in which case it should not be merged into the
//  previous function.
// Not implemented yet: both arguments are ignored and the answer is
//  always false.
bool IsFunctionPrologue(ea_t StartAddr, ea_t EndAddr) {
	const bool LooksLikePrologue = false;  // **!!** TODO
	return LooksLikePrologue;
} // end of IsFunctionPrologue()

// Patch program bytes that could not be converted from
//  data to code, if it can be determined that the bytes represent code
//  that IDA has a hard time with.
// Currently limited to finding "call near ptr 0" instructions, which
//  often are found in optimized glibc code because gcc was able to
//  determine that a function pointer was zero and did constant propagation,
//  but unfortunately was not able to determine that the code was unreachable.
//  IDA will not succeed in ua_code() for "call 0", but there is no danger
//  of a working program ever executing this code. Replacing the call with
//  no-ops permits us to continue converting a contiguous range of data to
//  code, and permits IDA to reanalyze the function later.
// Returns true if program bytes were patched.
// NOTE(review): the copy of this function that was available had lost
//  several lines (the "return false; }" after two of the failure
//  messages, the advance of PatchAddr, and the end of the patch loop).
//  They are restored here to match the other failure paths and the
//  documented return value; confirm against the repository.
bool MDPatchUnconvertedBytes(ea_t CurrDisasmAddr) {
	flags_t AddrFlags = getFlags(CurrDisasmAddr);
	if (isData(AddrFlags) || isTail(AddrFlags)) {
		// Bytes should have been converted to unknown already.
#if SMP_DEBUG_FIXUP_IDB
		msg("Cannot patch data bytes or tail bytes at %x\n", CurrDisasmAddr);
#endif
		return false;
	}
	SMPInstr PatchInstr(CurrDisasmAddr);
	PatchInstr.Analyze();
	int InstrLen = PatchInstr.GetCmd().size;
	if (0 >= InstrLen) {
#if SMP_DEBUG_FIXUP_IDB
		msg("ua_ana0() failed on patch location %x\n", CurrDisasmAddr);
#endif
		return false;
	}
	else {
		if (PatchInstr.GetCmd().itype != NN_call) {
#if SMP_DEBUG_FIXUP_IDB
			msg("Cannot patch non-call instruction at %x\n", CurrDisasmAddr);
#endif
			return false;
		}
		op_t CallDest = PatchInstr.GetFirstUse()->GetOp();
		if ((o_near != CallDest.type) || (0 != CallDest.addr)) {
#if SMP_DEBUG_FIXUP_IDB
			msg("Cannot patch call unless it is call near ptr 0 at %x\n",
				CurrDisasmAddr);
#endif
			return false;
		}
		// Overwrite every byte of the call with a no-op.
		ea_t PatchAddr = CurrDisasmAddr;
		for (int i = 0; i < InstrLen; ++i) {
			bool ok = patch_byte(PatchAddr, 0x90);  // x86 no-op
			if (!ok) {
#if SMP_DEBUG_FIXUP_IDB
				msg("patch_byte() failed at %x\n", PatchAddr);
#endif
				return false;
			}
			++PatchAddr;
		}
#if SMP_DEBUG_FIXUP_IDB
		msg("Patched %d bytes successfully at %x\n", InstrLen, CurrDisasmAddr);
#endif
		// Now that the bytes are no-ops, ask IDA to turn them into code.
		InstrLen = ua_code(CurrDisasmAddr);
		if (0 >= InstrLen) {
#if SMP_DEBUG_FIXUP_IDB
			msg("ua_code() failed after patching at %x\n", CurrDisasmAddr);
#endif
			return false;
		}
	} // end if (0 >= InstrLen) ... else ...
	return true;
} // end of MDPatchUnconvertedBytes()

// Use the lists of code addresses identified by IDA Pro (in IDAProLocs)
//  and an external disassembler (in DisasmLocs). Compare the lists and
//  try to convert addresses to code that are found in DisasmLocs but
//  not in IDAProLocs. Emit warnings when IDAProLocs has a code address
//  not found in DisasmLocs.
void FixCodeIdentification(void) {
	size_t DisasmIndex = 0;
	ea_t CurrDisasmAddr = DisasmLocs[DisasmIndex++];
	size_t IDAProIndex = 0;
	ea_t CurrAddr = IDAProLocs[IDAProIndex++];

	while (DisasmIndex <= DisasmLocs.size()) {
		// If the current address is less than the current
		//  external disasm address, we have the rare case in
		//  which IDA Pro has identified an address as code
		//  but the external disasm has not. Emit a warning
		//  message and go on to the next IDA address.
		if (CurrAddr < CurrDisasmAddr) {
			SMPInstr TempInstr(CurrAddr);
			TempInstr.Analyze();
			msg("Address %x is code in IDB but not in external disassembler: %s\n",
				CurrAddr, TempInstr.GetDisasm());
			if (IDAProIndex < IDAProLocs.size())
				CurrAddr = IDAProLocs[IDAProIndex++];
			else {
				// Last IDA addr; might still process Disasm addrs
				//  after loop exit.
				break;
			}
		}
		else if (CurrAddr == CurrDisasmAddr) {
			// If equal, no problem, we are moving through the
			//  code addresses in lockstep. Grab the next address
			//  from each source.
			if (DisasmIndex < DisasmLocs.size()) {
				CurrDisasmAddr = DisasmLocs[DisasmIndex++];
			}
			else {
				++DisasmIndex;  // cause loop exit; skip cleanup loop
			}
			if (IDAProIndex < IDAProLocs.size())
				CurrAddr = IDAProLocs[IDAProIndex++];
			else {
				// Last IDA addr; might still process Disasm addrs
				//  after loop exit in cleanup loop.
				break;
			}
		}
		else {
			// We must have CurrAddr > CurrDisasmAddr. That means
			//  IDA has jumped over some code addresses in
			//  DisasmLocs. We need to try to convert addresses
			//  to code until we can reach the current addr.
			int InstrLen;
			// For now, we will address only the case in which IDA
			//  has identified addresses as data bytes, and the
			//  external disassembler(e.g. objdump) has identified
			//  the same addresses as code. We only want to deal with
			//  contiguous areas of data-to-code conversion that do NOT
			//  follow a return statement.
			int AreaSize = 0;
			ea_t AreaStart = CurrDisasmAddr;
			ea_t AreaEnd;
#if SMP_DEBUG_FIXUP_IDB
			msg("CurrDisasmAddr: %x  CurrAddr: %x\n", CurrDisasmAddr, CurrAddr);
#endif
			bool SkipArea = FindDataToConvert(IDAProIndex, DisasmIndex, AreaSize);
			if (SkipArea) {
				// Skip over the extra external disasm addresses.
				while (CurrDisasmAddr < CurrAddr)
					CurrDisasmAddr = DisasmLocs[DisasmIndex++];
			}
			else { 
				// Convert the overlooked code region to unexplored.
				AreaEnd = CurrDisasmAddr + AreaSize;
#if SMP_DEBUG_FIXUP_IDB
				msg("Found data to convert: %x to %x\n", AreaStart, AreaEnd);
#endif
				do_unknown_range(AreaStart, AreaSize, DOUNK_SIMPLE);
				SMP_bounds_t ConvertRegion;
				ConvertRegion.startEA = AreaStart;
				ConvertRegion.endEA = AreaEnd;
				FixupRegion CurrRegion(ConvertRegion);
				CodeReanalyzeList.push_back(CurrRegion);
				do {
					flags_t InstrFlags = getFlags(CurrDisasmAddr);
					if (!isUnknown(InstrFlags)) {
						msg("Sync problem in FixCodeID: %x\n", CurrDisasmAddr);
					}
					else {
						InstrLen = ua_code(CurrDisasmAddr);
						if (InstrLen > 0) { // Successfully converted to code
							SMPInstr NewInstr(CurrDisasmAddr);
							NewInstr.Analyze();
							if (!NewInstr.MDIsNop())
								AllNops = false;
							msg("FixCodeID success at %x: len: %d %s\n", CurrDisasmAddr,
									InstrLen, NewInstr.GetDisasm());