From 99dc59d8e2a9d31909cf354e7d10c0439afe17b9 Mon Sep 17 00:00:00 2001
From: Jason Hiser <jdhiser@gmail.com>
Date: Thu, 22 Jun 2023 02:53:30 +0000
Subject: [PATCH] Working parser??

---
 src/arm_ehp.cpp  | 303 ++++++++++++++++++++++++++++++++---------------
 src/ehp_priv.hpp |  52 +++++++-
 test/test.cpp    |   1 +
 3 files changed, 258 insertions(+), 98 deletions(-)

diff --git a/src/arm_ehp.cpp b/src/arm_ehp.cpp
index 7170daf..723f237 100644
--- a/src/arm_ehp.cpp
+++ b/src/arm_ehp.cpp
@@ -27,7 +27,7 @@ bool split_arm_eh_frame_impl_t<ptrsize>::parse(const bool is_be)
 		const auto can_unwind = second_entry != 0x1; // can't unwind == 1:
 		const auto offset_to_start = handle_pcrel31(second_entry);
 		const auto contains_inline_unwind_entry = can_unwind && (second_entry>>31); // is inline if bit 31 set, and not special pattern cant_unwind.
-		const auto inline_unwind_entry = second_entry & 0x7fffffff;	            // the EH unwind table entry itself if it can be encoded in 31 bits.
+		//const auto inline_unwind_entry = second_entry & 0x7fffffff;	            // the EH unwind table entry itself if it can be encoded in 31 bits.
 		const auto lsda_addr = 
 			!can_unwind    ? 0 :					 // the special pattern 0x1 indicating can't unwind.
 			contains_inline_unwind_entry ? 0 :			 // no lsda addr if the entry is inline
@@ -54,103 +54,13 @@ bool split_arm_eh_frame_impl_t<ptrsize>::parse(const bool is_be)
 		{
 			// fetch the first word of the lsda.
 			throw_assert(extab_scoop->getStart() <= lsda_addr && lsda_addr <= extab_scoop->getEnd());
-
-			// note:  do not make reference as we are going to do unsafe stuff.
-			// and need a copy that won't change.
-			const auto contents_str = extab_scoop->getContents();
-			const auto contents = reinterpret_cast<const char*>(contents_str.data());
-			const auto start_offset = lsda_addr - extab_scoop->getStart();
-
-			// fetch 4 bytes to detect type
-			if(lsda_addr + sizeof(uint32_t) > extab_scoop->getEnd())
-				throw out_of_range("Cannot parse lsda at " + to_hex_string(lsda_addr));
-			const auto first_word = *reinterpret_cast<const uint32_t*>(&contents[start_offset]);
-			if(first_word >> 31) // check the top bit.
-			{
-				const auto byte1 = (first_word >> 0)&0xff;
-				const auto byte2 = (first_word >> 8)&0xff;
-				const auto byte3 = (first_word >> 16)&0xff;
-				const auto byte4 = (first_word >> 24)&0xff;
-				const auto personality_index = byte4 & 0xf;
-				cout << "Found arm32-specific model = " << hex << personality_index << endl; 
-				switch(personality_index)
-				{
-					case 0:
-					{
-						unwind_pgm.push_back(byte3);
-						unwind_pgm.push_back(byte2);
-						unwind_pgm.push_back(byte1);
-						break;
-					}
-					case 1:
-					case 2:
-					{
-						const auto words_following = byte3;
-						unwind_pgm.push_back(byte2);
-						unwind_pgm.push_back(byte1);
-						for(auto i = 0u; i < words_following; i++)
-						{
-							const auto next_word = *reinterpret_cast<const uint32_t*>(&contents[start_offset+4+i*4]);
-							unwind_pgm.push_back((next_word >> 24)&0xff);
-							unwind_pgm.push_back((next_word >> 16)&0xff);
-							unwind_pgm.push_back((next_word >> 8 )&0xff);
-							unwind_pgm.push_back((next_word >> 0 )&0xff);
-						}
-
-						break;
-					}
-					default:
-						throw new out_of_range("Unknown personality index: "+ to_string(personality_index));
-				}
-
-			}
-			else
-			{
-				// generic version.
-				const auto offset_to_personality_routine = handle_pcrel31(first_word);
-				const auto personality_routine_addr=lsda_addr+offset_to_personality_routine;
-				fde.setPersonality(personality_routine_addr);
-				cout << "Found generic model with personality = " << hex << personality_routine_addr << endl; 
-				const auto second_word = *reinterpret_cast<const uint32_t*>(&contents[start_offset+4]);
-				const auto byte1 = (second_word >> 0 )&0xff;
-				const auto byte2 = (second_word >> 8 )&0xff;
-				const auto byte3 = (second_word >> 16)&0xff;
-				const auto byte4 = (second_word >> 24)&0xff;
-				const auto words_following = byte4;
-				unwind_pgm.push_back(byte3);
-				unwind_pgm.push_back(byte2);
-				unwind_pgm.push_back(byte1);
-				for(auto i = 0u; i < words_following; i++)
-				{
-					const auto next_word = *reinterpret_cast<const uint32_t*>(&contents[start_offset+4+i*4]);
-					unwind_pgm.push_back((next_word >> 24)&0xff);
-					unwind_pgm.push_back((next_word >> 16)&0xff);
-					unwind_pgm.push_back((next_word >> 8 )&0xff);
-					unwind_pgm.push_back((next_word >> 0 )&0xff);
-				}
-
-				// 4 for personality routine, + 1 for a length specifier + the length in bytes.
-				fde.parse_lsda(lsda_addr+8+words_following*4,extab_scoop.get(), fde_start, is_be);
-			}
+			cout << "Found out-of-line unwind info." << endl << hex;
+			unwind_pgm=parse_arm_eh_pgm(lsda_addr,extab_scoop.get(),fde, is_be);
 		}
 		if(contains_inline_unwind_entry )
 		{
-				const auto byte1 = (inline_unwind_entry >> 0)&0xff;
-				const auto byte2 = (inline_unwind_entry >> 8)&0xff;
-				const auto byte3 = (inline_unwind_entry >> 16)&0xff;
-				const auto byte4 = (inline_unwind_entry >> 24)&0x7f; // drop high bit of high word for pcrel31.
-				unwind_pgm.push_back(byte4);
-				unwind_pgm.push_back(byte3);
-				unwind_pgm.push_back(byte2);
-				unwind_pgm.push_back(byte1);
-				cout << "Found arm32-specific inline_entra with 4 instructions:"  << endl << hex;
-				/*
-				cout
-				     <<  "\t\t" << byte1 << endl
-				     <<  "\t\t" << byte2 << endl
-				     <<  "\t\t" << byte3 << endl
-				     <<  "\t\t" << byte4 << endl;
-				*/
+			cout << "Found inline_entry:"  << endl << hex;
+			unwind_pgm=parse_arm_eh_pgm(current_address+4,exidx_scoop.get(),fde, is_be);
 		}
 		cout << "\tFde ("<< fde.getStartAddress();
 		cout << "Unwind pgm = " << hex << endl;
@@ -158,6 +68,7 @@ bool split_arm_eh_frame_impl_t<ptrsize>::parse(const bool is_be)
 		{
 			cout << "\t" << +byte << endl;
 		}
+		fde.setProgram(arm_eh_program_t<ptrsize>{unwind_pgm});
 		local_fdes.push_back(fde);
 
 		fde_idx += 2;
@@ -165,7 +76,7 @@ bool split_arm_eh_frame_impl_t<ptrsize>::parse(const bool is_be)
 	}
 	// last fde goes to the end of the linked section.
 	local_fdes[local_fdes.size()-1].setEndAddress(lnk_scoop->getEnd());
-
+	
 /*
 	for(const auto& fde: local_fdes)
 	{
@@ -183,6 +94,92 @@ bool split_arm_eh_frame_impl_t<ptrsize>::parse(const bool is_be)
 	return true;
 }
 
+// Returns the unwind-opcode bytes of the exception-table entry at lsda_addr inside lsda_scoop.  Generic-model entries
+// also set fde's personality and are handed to fde.parse_lsda().  Throws out_of_range on truncated or unknown entries.
+template <int ptrsize>
+vector<uint8_t> split_arm_eh_frame_impl_t<ptrsize>::parse_arm_eh_pgm(const uint64_t lsda_addr, const ScoopReplacement_t *lsda_scoop, arm_fde_contents_t<ptrsize> &fde, const bool is_be)
+{
+	auto unwind_pgm=vector<uint8_t>();
+	const auto fde_start=fde.getStartAddress();
+
+	// note:  do not make reference as we are going to do unsafe stuff.
+	// and need a copy that won't change.
+	const auto contents_str = lsda_scoop->getContents();
+	const auto contents = reinterpret_cast<const char*>(contents_str.data());
+	const auto start_offset = lsda_addr - lsda_scoop->getStart();
+
+	// bounds-checked fetch of the 32-bit word at `offset` within the copied contents.
+	// NOTE(review): words are read in host byte order; is_be is only forwarded to parse_lsda -- confirm intended.
+	const auto read_word = [&](const uint64_t offset) -> uint32_t
+	{
+		if(offset > contents_str.size() || contents_str.size() - offset < sizeof(uint32_t))
+			throw out_of_range("Cannot parse lsda at " + to_hex_string(lsda_addr));
+		return *reinterpret_cast<const uint32_t*>(&contents[offset]);
+	};
+	const auto push_word = [&](const uint32_t word)	// append a word's four bytes, most significant first.
+	{
+		for(auto shift = 24; shift >= 0; shift -= 8)
+			unwind_pgm.push_back((word >> shift)&0xff);
+	};
+	// fetch 4 bytes to detect type
+	if(lsda_addr + sizeof(uint32_t) > lsda_scoop->getEnd())
+		throw out_of_range("Cannot parse lsda at " + to_hex_string(lsda_addr));
+	const auto first_word = read_word(start_offset);
+	if(first_word >> 31) // check the top bit.
+	{
+		const auto byte1 = (first_word >> 0)&0xff;
+		const auto byte2 = (first_word >> 8)&0xff;
+		const auto byte3 = (first_word >> 16)&0xff;
+		const auto byte4 = (first_word >> 24)&0xff;
+		const auto personality_index = byte4 & 0xf;
+		cout << "Found arm32-specific personality routine, pr" << hex << personality_index << endl; 
+		switch(personality_index)
+		{
+			case 0:	// all three opcode bytes live in the first word.
+				{
+					unwind_pgm.push_back(byte3);
+					unwind_pgm.push_back(byte2);
+					unwind_pgm.push_back(byte1);
+					break;
+				}
+			case 1:
+			case 2:	// byte3 counts the additional opcode words that follow the first word.
+				{
+					const auto words_following = byte3;
+					unwind_pgm.push_back(byte2);
+					unwind_pgm.push_back(byte1);
+					for(auto i = 0u; i < words_following; i++)
+						push_word(read_word(start_offset+4+i*4));
+					break;
+				}
+			default:
+				throw out_of_range("Unknown personality index: "+ to_string(personality_index));	// by value, not `new`.
+		}
+	}
+	else
+	{
+		// generic version.
+		const auto offset_to_personality_routine = handle_pcrel31(first_word);
+		const auto personality_routine_addr=lsda_addr+offset_to_personality_routine;
+		fde.setPersonality(personality_routine_addr);
+		cout << "Found generic model with personality = " << hex << personality_routine_addr << endl; 
+		const auto second_word = read_word(start_offset+4);
+		const auto byte1 = (second_word >> 0 )&0xff;
+		const auto byte2 = (second_word >> 8 )&0xff;
+		const auto byte3 = (second_word >> 16)&0xff;
+		const auto byte4 = (second_word >> 24)&0xff;
+		const auto words_following = byte4;
+		unwind_pgm.push_back(byte3);
+		unwind_pgm.push_back(byte2);
+		unwind_pgm.push_back(byte1);
+		for(auto i = 0u; i < words_following; i++)
+			push_word(read_word(start_offset+8+i*4));
+		// 4 for personality routine, + 1 for a length specifier + the length in bytes.
+		fde.parse_lsda(lsda_addr+8+words_following*4,lsda_scoop, fde_start, is_be);
+	}
+	return unwind_pgm;
+}
+
 unique_ptr<const EHFrameParser_t> EHFrameParser_t::arm_factory(
 	uint8_t ptrsize,
 	EHPEndianness_t endian_type,
@@ -253,3 +250,117 @@ const FDEContents_t* split_arm_eh_frame_impl_t<ptrsize>::findFDE(uint64_t addr)
         return raw_ret_ptr;
 }
 
+// Returns the pointer view of this program's instructions, filling it on first use while it is still empty.
+template <int ptrsize>
+const EHProgramInstructionVector_t* arm_eh_program_t<ptrsize>::getInstructions() const
+{
+        if(instructions_cache.empty())
+                for(const auto &insn : instructions)
+                        instructions_cache.push_back(&insn);
+        return &instructions_cache;
+}
+
+// Splits a raw ARM unwind byte stream into instructions; the opcode table linked below determines each length.
+// Throws out_of_range or runtime_error when an opcode is unrecognized or its operand bytes run past the stream.
+template <int ptrsize>
+arm_eh_program_t<ptrsize>::arm_eh_program_t(const vector<uint8_t>&  unwind_pgm)
+{
+	auto unwind_idx=0u;
+	const auto unwind_pgm_data = unwind_pgm.data();
+	while(unwind_idx < unwind_pgm.size())
+	{
+		const auto opcode_byte1=unwind_pgm[unwind_idx];
+		const auto top_two=opcode_byte1>>6;
+		const auto top_four=opcode_byte1>>4;
+		const auto top_five=opcode_byte1>>3;
+		const auto bottom_three=opcode_byte1&0b111;
+		const auto bits543=(opcode_byte1>>3)&0b111;
+
+		//  see https://github.com/ARM-software/abi-aa/blob/main/ehabi32/ehabi32.rst#frame-unwinding-instructions
+		if(
+			top_two == 0b00 || 		// vsp=vsp+6-bit-immed
+			top_two == 0b01 || 		// vsp=vsp-6-bit-immed
+			top_four==0b1001 ||  		// set vsp=r[immed] || reserved if immed==13||15
+			top_four==0b1010 || 		// pop r4-r[4+immed] || pop r4-r[4+immed]+r14
+			opcode_byte1 == 0xb0 ||  	// finish
+			opcode_byte1 == 0b10110100 ||	// Pop Return Address Authentication Code pseudo-register (see remark g)
+			opcode_byte1 == 0b10110101 ||	// Use current vsp as modifier in Return Address Authentication (see remark h)
+			opcode_byte1 == 0b10110110 ||	// Spare (was Pop FPA)
+			opcode_byte1 == 0b10110111 ||	// Spare (was Pop FPA)
+			top_five == 0b10111	   ||	// Pop VFP double-precision registers D[8]-D[8+nnn] saved (as if) by FSTMFDX (see remark d)
+			(top_five == 0b11000 && bottom_three != 6 && bottom_three !=7)
+						   ||	// Intel Wireless MMX pop wR[10]-wR[10+nnn]
+			(top_five == 0b11001 && bottom_three != 0 && bottom_three !=1)
+						   ||	// Spare (yyy != 000, 001)
+			top_five == 0b11010	   ||	// Pop VFP double-precision registers D[8]-D[8+nnn] saved (as if) by VPUSH (see remark d)
+			(top_two == 0b11&& bits543 != 0b000 && bits543 != 0b001 && bits543 != 0b010)
+							// Spare (xxx != 000, 001, 010)
+		)
+		{
+			// one-byte insn.  string(count,char) on purpose: string{1,c} is an initializer list and would store two bytes.
+			instructions.push_back(arm_eh_program_insn_t<ptrsize>{string(1,static_cast<char>(opcode_byte1))});
+			unwind_idx++;
+		}
+		else if(
+			top_four == 0b1000		|| // 12-bit immed==0 ? refuse to unwind : pop registers indicated by immed
+			opcode_byte1 == 0b10110001 	|| // spare || pop registers
+			opcode_byte1 == 0b10110011  	|| // pop vfp double registers.
+			(top_five == 0b11000 && bottom_three == 6) 
+							|| // Intel Wireless MMX pop wR[ssss]-wR[ssss+cccc] (see remark e)
+			(top_five == 0b11000 && bottom_three == 7) 
+							|| // Spare || Intel Wireless MMX pop wCGR registers under mask {wCGR3,2,1,0} || Spare (xxxx != 0000) 
+			opcode_byte1 == 0b11001000   	|| // Pop VFP double precision registers D[16+ssss]-D[16+ssss+cccc] saved (as if) by VPUSH (see remarks d,e)
+			opcode_byte1 == 0b11001001   	   // Pop VFP double precision registers D[ssss]-D[ssss+cccc] saved (as if) by VPUSH (see remark d)
+		)
+		{
+			// two-byte insn: the opcode plus one operand byte, which must still be inside the program.
+			unwind_idx++;
+			if(unwind_idx>=unwind_pgm.size())
+				throw runtime_error("Cannot decode arm32 unwind instruction with prefix 0b1000");
+			const auto opcode_byte2=unwind_pgm[unwind_idx];
+			instructions.push_back(arm_eh_program_insn_t<ptrsize>{string{static_cast<char>(opcode_byte1),static_cast<char>(opcode_byte2)}});
+			unwind_idx++;
+		}
+		else if (
+			opcode_byte1 == 0b10110010 	// vsp += uleb128
+		)
+		{
+			// declare vars needed to call uleb routine.
+			const auto initial_pos=uint64_t{unwind_idx+1};
+			const auto max=initial_pos+unwind_pgm.size();	// NOTE(review): larger than the buffer; confirm read_uleb128's bound semantics.
+			auto final_pos=initial_pos;	// updated by read_uleb
+			auto res=uint64_t{0};			// ignore output of read_uleb, just need length
+			// require a terminating byte (high bit clear) inside the program, so a decoder that stops there stays in range.
+			if(find_if(begin(unwind_pgm)+initial_pos, end(unwind_pgm), [](const uint8_t b) { return (b&0x80) == 0; }) == end(unwind_pgm))
+				throw out_of_range("Unable to read uleb128 in unwind_pgm");
+			const auto fail = eh_frame_util_t<ptrsize>::read_uleb128(res,final_pos,unwind_pgm_data,max);
+			if(fail)
+				throw out_of_range("Unable to read uleb128 in unwind_pgm");
+			// calc uleb length and record instructions..
+			const auto uleb_len=final_pos-initial_pos;
+			instructions.push_back(arm_eh_program_insn_t<ptrsize>(string(reinterpret_cast<const char*>(unwind_pgm_data+unwind_idx), 1+uleb_len)));
+			const auto insn_len = 1+uleb_len;
+			unwind_idx+=insn_len;
+		}
+		else
+			throw out_of_range("Cannot determine arm32 unwind instruction length");
+	}
+}
+
+// Prints every instruction in order; all of them share one running pc that starts at `pc`.
+template <int ptrsize>
+void arm_eh_program_t<ptrsize>::print(const uint64_t pc, const int64_t caf) const
+{
+	auto running_pc = pc;	// the instruction's print() takes its pc argument by non-const reference.
+	for_each(ALLOF(instructions), [&](const arm_eh_program_insn_t<ptrsize> &insn) { insn.print(running_pc, caf); });
+}
+
+// Writes this instruction's byte count (decimal) and then each raw byte (hex) to cout; pc and caf are not used.
+template <int ptrsize>
+void arm_eh_program_insn_t<ptrsize>::print(uint64_t &pc, int64_t caf) const
+{
+	cout << "arm32 unwind insn len=" << dec << program_bytes.size() << "bytes = ";
+	for(auto idx = size_t{0}; idx < program_bytes.size(); ++idx) cout << hex << +program_bytes[idx] << ", ";
+	cout << endl;
+}
+
diff --git a/src/ehp_priv.hpp b/src/ehp_priv.hpp
index 21fae42..dbbb026 100644
--- a/src/ehp_priv.hpp
+++ b/src/ehp_priv.hpp
@@ -551,6 +551,51 @@ class arm_cie_contents_t : public CIEContents_t, private eh_frame_util_t<ptrsize
 
 };
 
+// One ARM unwind instruction, kept as its raw encoded bytes.
+template <int ptrsize>
+class arm_eh_program_insn_t  : public EHProgramInstruction_t
+{
+	public: 
+	// an instruction with no bytes.  defaulted in-class so that default-constructing one links.
+	arm_eh_program_insn_t() = default;
+	// copies the characters of s in as this instruction's encoded bytes.
+	arm_eh_program_insn_t(const string &s) 
+		:	
+		program_bytes(begin(s),end(s))
+	{
+	}
+
+	virtual ~arm_eh_program_insn_t() {}
+        virtual void print(uint64_t &pc, int64_t caf=1) const ;
+        virtual tuple<string, int64_t, int64_t> decode() const { throw std::runtime_error("not implemented"); }
+        virtual uint64_t getSize() const { return program_bytes.size(); }
+        virtual bool isNop() const { return false; }
+        virtual bool isDefCFAOffset() const { return false; }
+        virtual bool isRestoreState() const { return false; }
+        virtual bool isRememberState() const { return false; }
+        virtual const EHProgramInstructionByteVector_t& getBytes() const { return program_bytes; }
+        virtual bool advance(uint64_t &cur_addr, uint64_t CAF)     const { throw std::runtime_error("not implemented");  }
+
+	private:
+	vector<uint8_t> program_bytes;	// the encoded instruction, exactly as handed to the constructor.
+};
+
+// An ARM unwind program: the ordered instructions decoded from a raw unwind byte stream.
+template <int ptrsize>
+class arm_eh_program_t : public EHProgram_t
+{
+	public:
+	arm_eh_program_t(const vector<uint8_t>&  unwind_pgm={}); 	// decodes unwind_pgm; an empty stream by default.
+        virtual const EHProgramInstructionVector_t* getInstructions() const;
+	vector<arm_eh_program_insn_t <ptrsize> >& getInstructionsInternal() { return instructions; }
+	const vector<arm_eh_program_insn_t <ptrsize> >& getInstructionsInternal() const { return instructions; }	// element type must match `instructions` to bind.
+	void print(const uint64_t start_addr, const int64_t caf) const;
+	private:
+	vector<arm_eh_program_insn_t <ptrsize> > instructions;	// the decoded instructions, owned by value.
+	mutable EHProgramInstructionVector_t instructions_cache;	// pointer view of `instructions`, filled by getInstructions().
+};
+
+
 template <int ptrsize>
 class arm_fde_contents_t : public FDEContents_t, eh_frame_util_t<ptrsize> 
 {
@@ -559,7 +604,7 @@ class arm_fde_contents_t : public FDEContents_t, eh_frame_util_t<ptrsize>
 	uint64_t fde_lsda_addr=0;
 	uint32_t can_unwind=false;
 	lsda_t<ptrsize> lsda;
-	eh_program_t<ptrsize> eh_pgm;
+	arm_eh_program_t<ptrsize> eh_pgm;
 	arm_cie_contents_t<ptrsize> cie;
 	public:
 	arm_fde_contents_t( uint64_t fde_start,uint64_t lsda_addr,bool p_can_unwind) :
@@ -580,7 +625,7 @@ class arm_fde_contents_t : public FDEContents_t, eh_frame_util_t<ptrsize>
 	virtual uint64_t getFDEStartAddress() const { return fde_start_addr; } 
 	virtual uint64_t getFDEEndAddress() const {return fde_end_addr; }
         virtual const CIEContents_t& getCIE() const { return cie; }
-        virtual const EHProgram_t& getProgram() const { throw std::runtime_error(" not implimented"); }
+        virtual const EHProgram_t& getProgram() const { return eh_pgm; }
         virtual const LSDA_t* getLSDA() const { return &lsda; }
         virtual uint64_t getLSDAAddress() const { return fde_lsda_addr; }
         virtual uint64_t getStartAddressPosition() const { throw std::runtime_error(" not implimented"); }
@@ -593,6 +638,7 @@ class arm_fde_contents_t : public FDEContents_t, eh_frame_util_t<ptrsize>
 	void setEndAddress(uint64_t end) { fde_end_addr = end; }
 	bool getCanUnwind() const { return can_unwind; }
 	void setPersonality(uint64_t pers) { cie.setPersonality(pers); }
+	void setProgram(const arm_eh_program_t<ptrsize>&  pgm) { eh_pgm=pgm; }
 
 	bool parse_lsda(const uint64_t lsda_addr, 
 			const ScoopReplacement_t* extab_scoop,
@@ -622,6 +668,8 @@ class split_arm_eh_frame_impl_t : public EHFrameParser_t
 	mutable CIEVector_t cies_cache;
 	mutable FDEVector_t fdes_cache;
 
+	vector<uint8_t> parse_arm_eh_pgm(const uint64_t lsda_addr, const ScoopReplacement_t *lsda_scoop, arm_fde_contents_t<ptrsize> &fde, const bool is_be);
+
 	public:
 
 	split_arm_eh_frame_impl_t
diff --git a/test/test.cpp b/test/test.cpp
index 81f7b45..8924267 100644
--- a/test/test.cpp
+++ b/test/test.cpp
@@ -38,6 +38,7 @@ void print_lps(const EHFrameParser_t* ehp)
 	for(const auto fde : *fdes)
 	{
 		cout<<"Found FDE at : " << fde->getStartAddress() << "-"<<fde->getEndAddress()<<endl;
+		fde->getProgram().print(fde->getStartAddress(),1);
 		const auto lsda=fde->getLSDA();
 		assert(lsda);
 		lsda->print();
-- 
GitLab