From f46ffa33bac36c0608604d9c6feec24f08f8c5dd Mon Sep 17 00:00:00 2001
From: Anh <zenpoems@gmail.com>
Date: Wed, 3 Jul 2019 15:23:14 +0000
Subject: [PATCH] made backup_until includable as needed it in other places

---
 irdb-libs/ir_builders/back_search.hpp      | 167 +++++++++++++++++++++
 irdb-libs/ir_builders/fill_in_cfg.cpp      |  78 ++++++++--
 irdb-libs/ir_builders/fill_in_indtargs.cpp |   8 +-
 3 files changed, 238 insertions(+), 15 deletions(-)
 create mode 100644 irdb-libs/ir_builders/back_search.hpp

diff --git a/irdb-libs/ir_builders/back_search.hpp b/irdb-libs/ir_builders/back_search.hpp
new file mode 100644
index 000000000..35c59b1c4
--- /dev/null
+++ b/irdb-libs/ir_builders/back_search.hpp
@@ -0,0 +1,167 @@
+
+#ifndef back_search_hpp
+#define back_search_hpp
+/*
+ * Copyright (c) 2014 - Zephyr Software LLC
+ *
+ * This file may be used and modified for non-commercial purposes as long as
+ * all copyright, permission, and nonwarranty notices are preserved.
+ * Redistribution is prohibited without prior written consent from Zephyr
+ * Software.
+ *
+ * Please contact the authors for restrictions applying to commercial use.
+ *
+ * THIS SOURCE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+ * MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Author: Zephyr Software
+ * e-mail: jwd@zephyr-software.com
+ * URL   : http://www.zephyr-software.com/
+ *
+ */
+
+#include <iostream>
+#include <limits>
+#include <string>
+#include <algorithm>
+#include <stdlib.h>
+#include <string.h>
+#include <map>
+#include <assert.h>
+#include <regex.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <cctype>
+
+using namespace IRDB_SDK;
+using namespace std;
+// NOTE(review): these using-directives apply to every file that includes this header.
+/*
+ * defines: ALLOF(a) expands to the two arguments begin(a),end(a).
+ */
+#define ALLOF(a) begin(a),end(a)
+
+// NOTE(review): preds is defined (not just declared) here, so every including translation unit defines it -- confirm no two of them are linked together.
+// a way to map an instruction to its set of (direct) predecessors; filled in by calc_preds().
+map< Instruction_t* , InstructionSet_t > preds;
+
+// Rebuild the global preds map: every instruction becomes a predecessor of its target and of its fallthrough.
+void calc_preds(FileIR_t* firp)
+{
+	const auto add_edge=[](Instruction_t* to, Instruction_t* from) { if(to!=nullptr) preds[to].insert(from); };
+	preds.clear();
+	for(const auto cur : firp->getInstructions())
+	{
+		add_edge(cur->getTarget(),      cur);
+		add_edge(cur->getFallthrough(), cur);
+	}
+}
+
+
+// Walk backwards from orig through the global preds map, looking for an instruction
+// whose disassembly matches insn_type_regex_str (a POSIX extended regex).
+//   prev           - out: the matching instruction on a true return, else where the walk ended.
+//   orig           - where to start; orig itself is never tested, only its predecessors.
+//   stop_if_set    - optional regex; return false once an examined instruction writes an
+//                    operand whose string matches it.  "" disables the check.
+//   recursive      - when the single-predecessor chain ends, also try each predecessor in turn.
+//   max_insns      - budget of steps that may be taken along single-predecessor chains.
+//   max_recursions - recursion budget, divided by the predecessor count at each recursive step.
+// Patterns are compiled once and cached; a pattern that fails to compile aborts the program.
+bool backup_until(const string &insn_type_regex_str, 
+		  Instruction_t *& prev, 
+		  Instruction_t* orig, 
+		  const string & stop_if_set="", 
+		  bool recursive=false, 
+		  uint32_t max_insns=10000u, 
+		  uint32_t max_recursions=5u)
+{
+	// cache of compiled regexs keyed by pattern text; entries are freed when the program ends.
+	const auto find_or_build_regex=[] (const string& s) -> const regex_t&
+		{
+			// release the compiled pattern, then the storage that held it.
+			struct regex_freer_t
+			{
+				void operator()(regex_t* to_free) const { regfree(to_free); delete to_free; }
+			};
+			static map<string, unique_ptr<regex_t, regex_freer_t> > regexs_used;
+
+			const auto it=regexs_used.find(s);
+			if(it!=regexs_used.end())
+				return *it->second;
+			// compile before caching, so a pattern that fails to compile is never stored or regfree'd.
+			auto fresh=unique_ptr<regex_t>(new regex_t);
+			const auto ret=regcomp(fresh.get(), s.c_str(), REG_EXTENDED);
+			if(ret!=0)
+			{
+				// this used to be assert-only, so NDEBUG builds went on to regexec a bad regex_t.
+				char why[256]={};
+				regerror(ret, fresh.get(), why, sizeof(why));
+				cerr<<"backup_until: cannot compile regex '"<<s<<"': "<<why<<endl;
+				abort();
+			}
+			auto &slot=regexs_used[s];
+			slot.reset(fresh.release());
+			return *slot;
+		};
+
+	// build regexs; the search pattern is always compiled (even ""), the stop expression only when supplied.
+	const auto &preg            = find_or_build_regex(insn_type_regex_str);
+	const auto  stop_expression = stop_if_set.empty() ? nullptr : &find_or_build_regex(stop_if_set);
+
+	// classify one candidate: requested type, writes something we must stop on, or neither.
+	enum class verdict_t { match, stop, keep_going };
+	const auto examine=[&](Instruction_t* candidate) -> verdict_t
+		{
+			const auto disasm=DecodedInstruction_t::factory(candidate);
+			// check it's the requested type
+			if(regexec(&preg, disasm->getDisassembly().c_str(), 0, nullptr, 0) == 0)
+				return verdict_t::match;
+
+			// check whether it writes an operand the caller asked us to stop on.
+			if(stop_expression!=nullptr)
+				for(const auto &operand : disasm->getOperands())
+					if(operand->isWritten() && regexec(stop_expression, operand->getString().c_str(), 0, nullptr, 0) == 0)
+						return verdict_t::stop;
+
+			return verdict_t::keep_going;
+		};
+
+	prev=orig;
+	while(preds[prev].size()==1 && max_insns > 0)
+	{
+		// dec max for next loop 
+		max_insns--;
+
+		// get the only item in the list.
+		prev=*(preds[prev].begin());
+		const auto verdict=examine(prev);
+		if(verdict!=verdict_t::keep_going)
+			return verdict==verdict_t::match;
+
+		// otherwise, try backing up again.
+	}
+	if(recursive && max_insns > 0 && max_recursions > 0 )
+	{
+		// can't just use prev because recursive call will update it.
+		const auto myprev=prev;
+		const auto &mypreds=preds[myprev];
+		for(const auto pred : mypreds)
+		{
+			prev=pred;// mark that we are here, in case we return true here.
+			const auto verdict=examine(pred);
+			if(verdict!=verdict_t::keep_going)
+				return verdict==verdict_t::match;
+			if(backup_until(insn_type_regex_str, prev, pred, stop_if_set, recursive, max_insns, max_recursions/mypreds.size()))
+				return true;
+
+			// reset for next call
+			prev=myprev;
+		}
+	}
+	return false;
+}
+
+
+#endif
diff --git a/irdb-libs/ir_builders/fill_in_cfg.cpp b/irdb-libs/ir_builders/fill_in_cfg.cpp
index 6d5e6ed4f..e1f694fd2 100644
--- a/irdb-libs/ir_builders/fill_in_cfg.cpp
+++ b/irdb-libs/ir_builders/fill_in_cfg.cpp
@@ -27,6 +27,7 @@
 #include <ctype.h>
 #include "elfio/elfio.hpp"
 #include "split_eh_frame.hpp"
+#include "back_search.hpp"
 
 using namespace std;
 using namespace EXEIO;
@@ -494,13 +495,15 @@ void PopulateCFG::fill_in_scoops(FileIR_t *firp)
 
 void PopulateCFG::detect_scoops_in_code(FileIR_t *firp)
 {
+	// make sure preds are up to date for this
+	calc_preds(firp);
+
 	// data for this function
 	auto already_scoopified=map<VirtualOffset_t,DataScoop_t*>();
 
 	const auto is_arm64       = firp->getArchitecture()->getMachineType() == admtAarch64;
 	const auto is_arm32       = firp->getArchitecture()->getMachineType() == admtArm32;
 	const auto is_arm_variant = is_arm32 || is_arm64;
-	const auto do_unpin       = is_arm32;
 
 	// only valid for arm64
 	if(!is_arm_variant) return;
@@ -511,25 +514,71 @@ void PopulateCFG::detect_scoops_in_code(FileIR_t *firp)
 		// look for ldr's with a pcrel operand
 		const auto d               = DecodedInstruction_t::factory(insn);
 		const auto mnemonic        = d->getMnemonic();
+		const auto is_ldrd_variant  = mnemonic.substr(0,4) == "ldrd";
 		const auto is_ldr_variant  = mnemonic.substr(0,3) == "ldr";
 		const auto is_vldr_variant = mnemonic.substr(0,4) == "vldr";
-		const auto is_relevant_ldr = is_ldr_variant || is_vldr_variant;
+		const auto is_relevant_ldr = is_ldr_variant || is_vldr_variant || is_ldrd_variant;
 		if(!is_relevant_ldr) continue;	 
+
+
+		// extract op0
 		const auto op0             = d->getOperand(0);
 
-		// capstone reports ldrd instructions as having 2 "dest" operands.
-		// so we skip to the 3rd operand to get the memory op.  
-		// todo:  fix libirdb-core to fix this and skip the odd operand.
-		// todo:  report to capstone that they are broken.
-		const auto mem_op = mnemonic[3]=='d' ? d->getOperand(2) : d->getOperand(1);
-	       	if( !mem_op->isPcrel()) continue;
+		// the address we detect as referenced by this instruction.
+		auto referenced_address = VirtualOffset_t(0);
+		auto do_unpin       = is_arm32;
+
+		if(is_ldrd_variant && !d->getOperand(2)->isPcrel())
+		{
+			const auto mem_op = d->getOperand(2);
+			if( mem_op->hasIndexRegister() ) 	continue;
+			if( mem_op->getMemoryDisplacement() != 0 ) continue;
+
+			const auto mem_str         = mem_op->getString();
+			const auto end_of_base_reg = mem_str.find(" ");
+			const auto find_reg        = mem_str.substr(0,end_of_base_reg);
+
+			// find the instruction that sets the base reg.
+			auto add_pc_insn = (Instruction_t*)nullptr;
+			if(!backup_until( string()+"add.* "+find_reg+", pc, #",  /* look for this pattern. */
+						add_pc_insn,        /* strong instruction in add_pc_insn */
+						insn,               /* insn I10 */
+						"^"+find_reg+"$"    /* stop if find_reg set */
+						))
+			{
+				continue;
+			}
+
+			const auto add_pc_insn_d = DecodedInstruction_t::factory(add_pc_insn);
+
+			// record to unpin something.
+			referenced_address = add_pc_insn_d -> getOperand(2)->getConstant() + (is_arm32 ? add_pc_insn->getAddress()->getVirtualOffset() + 8 : 0); 
+			do_unpin           = false;
+
+		}
+		else
+		{
+
+			// capstone reports ldrd instructions as having 2 "dest" operands.
+			// so we skip to the 3rd operand to get the memory op.  
+			// todo:  fix libirdb-core to fix this and skip the odd operand.
+			// todo:  report to capstone that they are broken.
+			const auto mem_op = d->getOperand(1);
+			if( !mem_op->isPcrel()) continue;
+
+			// if there is an indexing operation, skip this instruction.
+			if( mem_op->hasIndexRegister()) continue;
+
+			// sanity check that it's a memory operation, and extract fields
+			assert(mem_op->isMemory());
+
+			
+			referenced_address = mem_op->getMemoryDisplacement() + (is_arm32 ? insn->getAddress()->getVirtualOffset() + 8 : 0); 
+		}
 
-		// if there is an indexing operation, skip this instruction.
-	       	if( mem_op->hasIndexRegister()) continue;
+		// no address found.
+		if(referenced_address == 0) continue;
 
-		// sanity check that it's a memory operation, and extract fields
-		assert(mem_op->isMemory());
-		const auto referenced_address = mem_op->getMemoryDisplacement() + (is_arm32 ? insn->getAddress()->getVirtualOffset() + 8 : 0); 
 		const auto name           = "data_in_text_"+to_hex_string(referenced_address);
 		const auto op0_str            = op0->getString();
 
@@ -537,7 +586,8 @@ void PopulateCFG::detect_scoops_in_code(FileIR_t *firp)
 		if(is_arm32 && op0_str == "pc" ) continue;
 
 		const auto referenced_size    =  // could use API call?
-			is_arm64 && op0_str[0]=='w' ? 4  :  // arm64 regs
+			is_ldrd_variant             ? 8  : // special case for load int reg pair.
+			is_arm64 && op0_str[0]=='w' ? 4  : // arm64 regs
 			is_arm64 && op0_str[0]=='x' ? 8  : 
 			is_arm64 && op0_str[0]=='s' ? 4  : 
 			is_arm64 && op0_str[0]=='d' ? 8  : 
diff --git a/irdb-libs/ir_builders/fill_in_indtargs.cpp b/irdb-libs/ir_builders/fill_in_indtargs.cpp
index 93d0b963d..99da1812d 100644
--- a/irdb-libs/ir_builders/fill_in_indtargs.cpp
+++ b/irdb-libs/ir_builders/fill_in_indtargs.cpp
@@ -43,6 +43,7 @@
 #include "check_thunks.hpp"
 #include "fill_in_indtargs.hpp"
 #include "libMEDSAnnotation.h"
+#include "back_search.hpp"
 
 using namespace IRDB_SDK;
 using namespace std;
@@ -95,8 +96,10 @@ map<VirtualOffset_t,ibt_provenance_t> targets;
 // the set of ranges represented by the eh_frame section, could be empty for non-elf files.
 set< pair< VirtualOffset_t, VirtualOffset_t> > ranges;
 
+#if 0
 // a way to map an instruction to its set of (direct) predecessors. 
 map< Instruction_t* , InstructionSet_t > preds;
+#endif
 
 // keep track of jmp tables
 map< Instruction_t*, fii_icfs > jmptables;
@@ -544,7 +547,7 @@ set<Instruction_t*> find_in_function(string needle, Function_t *haystack)
 }
 
 
-
+#if 0
 bool backup_until(const string &insn_type_regex_str, 
 		  Instruction_t *& prev, 
 		  Instruction_t* orig, 
@@ -648,6 +651,7 @@ bool backup_until(const string &insn_type_regex_str,
 	}
 	return false;
 }
+#endif
 
 
 void check_for_arm32_switch_type1(
@@ -2742,6 +2746,7 @@ void check_for_nonPIC_switch_table(FileIR_t* firp, Instruction_t* insn, const De
 	jmptables[IJ].setAnalysisStatus(iasAnalysisComplete);
 }
 
+#if 0
 void calc_preds(FileIR_t* firp)
 {
         preds.clear();
@@ -2753,6 +2758,7 @@ void calc_preds(FileIR_t* firp)
                         preds[insn->getFallthrough()].insert(insn);
         }
 }
+#endif
 
 void handle_takes_address_annot(FileIR_t* firp,Instruction_t* insn, MEDS_TakesAddressAnnotation* p_takes_address_annotation)
 {
-- 
GitLab