小记 - 基于二进制的正则匹配

0. 起因

今天是2025年3月15日的0点34分,我的舍友此时正在王者荣耀里双排开黑,而此时的我既十分惊讶又十分后悔。惊讶是因为我发现了一种新的二进制正则匹配办法,而且这个办法比我之前想到的还要好;后悔的是我居然今天才想出来。

1. 第一种办法

我之前想到的第一种办法是:硬解析二进制

#include <boost/regex.hpp>

// 2025年3月11日: 经过排查,标准库的 std::regex_traits 只为 char 和 wchar_t 提供了特化,因此 std::basic_regex<uint8_t> 根本无法使用,只能先把字节转换成 char 再交给 std::regex

// Half-open span [start_pos, end_pos) of one match or capture group,
// expressed as element offsets from the start of the searched buffer.
// Both members default to zero so that a default-constructed _Range is never
// indeterminate; aggregate initialisation `_Range{a, b}` keeps working (C++14+).
// NOTE(review): identifiers that start with an underscore followed by an
// uppercase letter are reserved for the implementation; the name is kept only
// because other code refers to it.
struct _Range {
	size_t start_pos = 0;  // offset of the first matched element
	size_t end_pos = 0;    // offset one past the last matched element
};

// One regex hit: element 0 is the whole match and the following elements are
// the capture groups. Each entry pairs a view of the matched bytes with the
// offsets of that view inside the searched buffer.
// NOTE(review): std::basic_string_view<std::uint8_t> needs a
// std::char_traits<std::uint8_t>, which the C++ standard does not provide —
// confirm that the target standard library accepts it.
using MatchItem = std::vector<std::pair<std::basic_string_view<std::uint8_t>, _Range>>;
// Every hit produced by one search call, in the order the hits were found.
using MatchList = std::vector<MatchItem>;

// Escapes a raw byte sequence so it can be embedded literally in a regex.
//
// Every byte equal to one of the metacharacters . ^ $ * + ? ( ) [ ] { } | \
// is prefixed with a backslash; all other bytes (including 0x00 and bytes
// >= 0x80) are copied unchanged.
//
// @param str  raw bytes that should be matched literally
// @return     the escaped byte sequence
//
// The escaping is done in a single pass instead of building a std::regex on
// every call and converting the bytes to std::string and back.
std::basic_string<uint8_t> bin_escape_seq(const std::basic_string<uint8_t>& str) {
	std::basic_string<uint8_t> escaped;
	escaped.reserve(str.size());

	for (const uint8_t byte : str) {
		switch (byte) {
		case '.': case '^': case '$': case '*': case '+': case '?':
		case '(': case ')': case '[': case ']': case '{': case '}':
		case '|': case '\\':
			escaped.push_back('\\');
			break;
		default:
			break;
		}
		escaped.push_back(byte);
	}

	return escaped;
}

// Regex search-and-replace over raw bytes.
//
// The subject, the pattern and the replacement format are each copied into a
// std::string (std::regex works on char), the replacement is performed with
// std::regex_replace, and the result is copied back into a byte string.
//
// @param str         bytes to search in
// @param regex       pattern, given as bytes
// @param replaceStr  replacement format, given as bytes
// @return            the bytes of str with every match replaced
std::basic_string<uint8_t> bin_replace(std::basic_string_view<uint8_t> str, std::basic_string_view<uint8_t> regex, std::basic_string_view<uint8_t> replaceStr) {
	const std::string pattern_text(regex.begin(), regex.end());
	const std::regex compiled(pattern_text);

	const std::string subject(str.begin(), str.end());
	const std::string format_text(replaceStr.begin(), replaceStr.end());
	const std::string replaced = std::regex_replace(subject, compiled, format_text);

	return std::basic_string<uint8_t>(replaced.begin(), replaced.end());
}

// Reports whether the whole byte sequence `str` matches the pattern `regex`.
// Both arguments are copied into std::string first because std::regex
// operates on char.
bool bin_match(std::basic_string_view<uint8_t> str, std::basic_string_view<uint8_t> regex) {
	const std::string subject(str.begin(), str.end());
	const std::string pattern_text(regex.begin(), regex.end());
	const std::regex compiled(pattern_text);
	return std::regex_match(subject, compiled);
}

// Finds every non-overlapping match of `regex` inside the byte buffer `str`.
//
// @param str    bytes to search; the returned views point into this buffer,
//               so it must outlive the result
// @param regex  pattern, given as bytes (copied into a std::string because
//               std::regex operates on char)
// @return       one MatchItem per hit; entry 0 of an item is the whole match,
//               the following entries are the capture groups, each with its
//               [start_pos, end_pos) offsets relative to the start of `str`
//
// A zero-length match used to leave the search position unchanged, which made
// the loop spin forever while `ret` grew; the position is now moved one byte
// past an empty match (or the loop stops when the end has been reached).
MatchList bin_search(std::basic_string_view<std::uint8_t> str, std::basic_string_view<uint8_t> regex) {
	MatchList ret{};

	auto begin = str.cbegin();
	const auto end = str.cend();
	std::match_results<decltype(str)::const_iterator> m;
	const std::regex r(std::string(regex.begin(), regex.end()));

	while (std::regex_search(begin, end, m, r)) {
		// Build the item in place instead of copying a finished vector.
		MatchItem& item = ret.emplace_back();
		item.reserve(m.size());
		for (size_t i = 0; i < m.size(); i++) {
			const _Range pos{
				static_cast<size_t>(m[i].first - str.begin()),
				static_cast<size_t>(m[i].second - str.begin()),
			};
			item.emplace_back(str.substr(pos.start_pos, pos.end_pos - pos.start_pos), pos);
		}

		if (m[0].first != m[0].second) {
			// Normal case: continue right after the match.
			begin = m[0].second;
		}
		else if (m[0].second == end) {
			// Empty match at the very end: nothing left to scan.
			break;
		}
		else {
			// Empty match: step over one byte so the search makes progress.
			begin = m[0].second + 1;
		}
	}

	return ret;
}

就是这样的代码,如此直接,把二进制数据硬生生地代入到正则匹配中。我也想到regex模式串里可能存在需要转义的字节,因此就搭配bin_escape_seq()来实现转义

然后真实的应用场景就是这样。。。

// Begin/end marker byte sequences, escaped so that any byte which happens to
// be a regex metacharacter is matched literally.
std::basic_string<uint8_t> sdk_begin_sign = bin_escape_seq({ 0xEB,0x0F,0x4F,0x50,0x72,0x6F,0x74,0x65,0x63,0x74,0x20,0x42,0x65,0x67,0x69,0x6E,0x00 });
std::basic_string<uint8_t> sdk_end_sign = bin_escape_seq({ 0xEB,0x0D,0x4F,0x50,0x72,0x6F,0x74,0x65,0x63,0x74,0x20,0x45,0x6E,0x64,0x00 });

// Fetch the section content once so that both iterators refer to the same
// object. NOTE(review): if content() returns by value, calling it twice yields
// iterators into two different temporaries — confirm against the API in use.
const auto& raw_content = sec.content();
std::basic_string<uint8_t> sec_content(raw_content.begin(), raw_content.end());

// `[\s\S]*?` instead of `.*`:
//  - `.` does not match line terminators in the ECMAScript grammar, so a
//    protected region containing a 0x0A or 0x0D byte would never match;
//  - the lazy quantifier stops at the nearest end marker, so every protected
//    region is reported on its own instead of one match that runs from the
//    first begin marker to the last end marker.
auto matchs = bin_search(sec_content, sdk_begin_sign + std::basic_string<uint8_t>(reinterpret_cast<const uint8_t*>(R"(([\s\S]*?))")) + sdk_end_sign);

// Entry 0 of every hit is the whole match (markers included); translate its
// offsets into `func` using the base offset `off`.
for (const auto& match : matchs) {
	func.begin = off + match[0].second.start_pos;
	func.end = off + match[0].second.end_pos;
	func.size = func.end - func.begin;
	result.push_back(func);
}

我来总结下这种办法的优缺点

优点:

  • 方便直接,省空间

缺点:

  • 模式串在代码里特别不好看
  • 正则匹配容易出现问题(有错误和遗漏,例如 . 不匹配 0x0A、0x0D 这样的换行字节)
  • 模式串还会遇到0x00截断的问题

总结下来就是,太姬芭沙比了

2. 第二种办法

灵感是从 ghidra 和 ida 想到的

我们都说,正则匹配一般是为可见字符(也可以叫"可打印字符")服务的,像上面这种硬是给不可见字符(也可以叫"不可打印字符")服务,不仅正则引擎嫌难受,我们看着用着也难受

要想得到最有效的结果,基本的原则自然就是要与正则引擎"门当户对"。可见字符是吧,我们直接把二进制数据解析成可见字符不就行了?

把一个个字节变成一个个可见字符,就可以了

比如有一个字节 0x3C,我们将其解析成两个'3''C'字符,换成ASCII码形式就是0x33,0x43

表示一个字节所用的十六进制字符总共只有16种:0123456789ABCDEF
就这么简单,16个字符里没有一个需要特殊转义的,用就行了,而且在代码里边看的也非常顺(要是嫌太长,自己写个宏、保存到文件里再读取都行)

我想到的唯一的缺点,就是太占内存空间了:一个字节要写成两个十六进制字符,而保存这两个字符本身就需要两个字节,也就是说,如果要匹配100字节的数据,你就得准备200字节的空间

功能:

// Half-open span [start_pos, end_pos) of one match or capture group,
// expressed as character offsets from the start of the searched string.
// Both members default to zero so that a default-constructed _Range is never
// indeterminate; aggregate initialisation `_Range{a, b}` keeps working (C++14+).
// NOTE(review): identifiers that start with an underscore followed by an
// uppercase letter are reserved for the implementation; the name is kept only
// because other code refers to it.
struct _Range {
	size_t start_pos = 0;  // offset of the first matched character
	size_t end_pos = 0;    // offset one past the last matched character
};

// One regex hit: element 0 is the whole match and the following elements are
// the capture groups. Each entry pairs a view of the matched text with the
// offsets of that view inside the searched string.
using MatchItem = std::vector<std::pair<std::string_view, _Range>>;
// Every hit produced by one search call, in the order the hits were found.
using MatchList = std::vector<MatchItem>;

// Escapes `str` so it can be embedded literally in a regex.
//
// Every character that is one of the metacharacters . ^ $ * + ? ( ) [ ] { } | \
// is prefixed with a backslash; all other characters are copied unchanged.
//
// @param str  text that should be matched literally
// @return     the escaped text
//
// The escaping is done in a single pass instead of compiling a std::regex on
// every call for what is a fixed 14-character lookup.
std::string regex_escape_seq(std::string_view str) {
	constexpr std::string_view kMetaChars = R"(.^$*+?()[]{}|\)";

	std::string escaped;
	escaped.reserve(str.size());

	for (const char ch : str) {
		if (kMetaChars.find(ch) != std::string_view::npos) {
			escaped.push_back('\\');
		}
		escaped.push_back(ch);
	}

	return escaped;
}

// Replaces every match of the pattern `regex` in `str` with `replaceStr`
// (interpreted by std::regex_replace as a replacement format) and returns the
// resulting text.
std::string regex_replace(std::string_view str, std::string_view regex, std::string_view replaceStr) {
	const std::string pattern_text(regex.begin(), regex.end());
	const std::regex compiled(pattern_text);

	const std::string subject(str);
	const std::string format_text(replaceStr);
	return std::regex_replace(subject, compiled, format_text);
}

// Reports whether the whole of `str` matches the pattern `regex`.
bool regex_match(std::string_view str, std::string_view regex) {
	const std::string subject(str);
	const std::string pattern_text(regex);
	const std::regex compiled(pattern_text);
	return std::regex_match(subject, compiled);
}

// Finds every non-overlapping match of `regex` inside `str`.
//
// @param str    text to search; the returned views point into this buffer,
//               so it must outlive the result
// @param regex  pattern text
// @return       one MatchItem per hit; entry 0 of an item is the whole match,
//               the following entries are the capture groups, each with its
//               [start_pos, end_pos) character offsets relative to `str`
//
// The hits are enumerated with std::regex_iterator rather than by restarting
// std::regex_search by hand:
//  - a zero-length match (e.g. pattern "a*") no longer leaves the search
//    position unchanged, which used to loop forever;
//  - later searches know that text precedes them, so `^` and `\b` are not
//    re-matched in the middle of the input.
//
// NOTE(review): when `str` is a hex dump (two characters per byte) the offsets
// are character offsets; a caller presumably has to require an even start_pos
// and halve the offsets to obtain byte positions — confirm at the call site.
MatchList regex_search(std::string_view str, std::string_view regex) {
	using Iter = std::string_view::const_iterator;

	MatchList ret{};
	const std::regex r(regex.begin(), regex.end());

	const std::regex_iterator<Iter> last;
	for (std::regex_iterator<Iter> it(str.begin(), str.end(), r); it != last; ++it) {
		const std::match_results<Iter>& m = *it;

		// Build the item in place instead of copying a finished vector.
		MatchItem& item = ret.emplace_back();
		item.reserve(m.size());
		for (size_t i = 0; i < m.size(); i++) {
			const _Range pos{
				static_cast<size_t>(m[i].first - str.begin()),
				static_cast<size_t>(m[i].second - str.begin()),
			};
			item.emplace_back(str.substr(pos.start_pos, pos.end_pos - pos.start_pos), pos);
		}
	}

	return ret;
}

// Formats a single byte as exactly two hexadecimal digits.
//
// @param bytehex    the byte to format
// @param uppercase  true for digits A-F, false for a-f
// @return           a two-character string, high nibble first
std::string bytes2String(uint8_t bytehex, bool uppercase) {
    static constexpr char kUpperDigits[] = "0123456789ABCDEF";
    static constexpr char kLowerDigits[] = "0123456789abcdef";
    const char* const digits = uppercase ? kUpperDigits : kLowerDigits;

    const unsigned high_nibble = (bytehex >> 4) & 0xF;
    const unsigned low_nibble = bytehex & 0xF;

    std::string text;
    text.push_back(digits[high_nibble]);
    text.push_back(digits[low_nibble]);
    return text;
}

// Hex-encodes a byte sequence: every byte becomes two hexadecimal digits,
// high nibble first, so the result is twice as long as the input.
//
// @param bytes      bytes to encode
// @param uppercase  true (default) for digits A-F, false for a-f
// @return           the hexadecimal text; empty when `bytes` is empty
//
// The digit alphabet is chosen once before the loop and the two digits are
// appended directly, instead of building a temporary two-character string
// for every byte.
std::string convertByteString(const std::basic_string<uint8_t>& bytes, bool uppercase = true) {
    const char* const digits = uppercase ? "0123456789ABCDEF" : "0123456789abcdef";

    std::string result;
    result.reserve(bytes.size() * 2);

    for (const uint8_t byte : bytes) {
        result.push_back(digits[(byte >> 4) & 0xF]);
        result.push_back(digits[byte & 0xF]);
    }

    return result;
}

// Decodes one hexadecimal digit (0-9, A-F or a-f) into its value 0..15.
// Throws std::invalid_argument for any other character.
static uint8_t hexDigitValue(char digit) {
    if (digit >= '0' && digit <= '9') {
        return static_cast<uint8_t>(digit - '0');
    }
    if (digit >= 'A' && digit <= 'F') {
        return static_cast<uint8_t>(digit - 'A' + 10);
    }
    if (digit >= 'a' && digit <= 'f') {
        return static_cast<uint8_t>(digit - 'a' + 10);
    }
    throw std::invalid_argument("Invalid hex character");
}

// Decodes hexadecimal text back into bytes; every pair of digits (high nibble
// first, upper or lower case) yields one byte.
//
// @param hexStr  hexadecimal text with an even number of digits
// @return        the decoded bytes; empty when `hexStr` is empty
// @throws std::invalid_argument  when the length is odd or a character is not
//                                a hexadecimal digit
std::basic_string<uint8_t> convertBytes(const std::string& hexStr) {
    if (hexStr.length() % 2 != 0) {
        throw std::invalid_argument("Hex string length must be even");
    }

    std::basic_string<uint8_t> result;
    result.reserve(hexStr.length() / 2);

    for (size_t i = 0; i < hexStr.length(); i += 2) {
        // The high digit is decoded first, so an invalid pair is reported in
        // left-to-right order.
        const uint8_t high = hexDigitValue(hexStr[i]);
        const uint8_t low = hexDigitValue(hexStr[i + 1]);
        result.push_back(static_cast<uint8_t>((high << 4) | low));
    }

    return result;
}

测试代码:

int main() {
    std::string _sdk_begin_sign = std::string() + '\xEB' + '\x0F' + '\x4F' + '\x50' + '\x72' + '\x6F' + '\x74' + '\x65' + '\x63' + '\x74' + '\x20' + '\x42' + '\x65' + '\x67' + '\x69' + '\x6E' + '\x00';
    std::string _sdk_end_sign = std::string() + '\xEB' + '\x0D' + '\x4F' + '\x50' + '\x72' + '\x6F' + '\x74' + '\x65' + '\x63' + '\x74' + '\x20' + '\x45' + '\x6E' + '\x64' + '\x00';
    
    std::string sdk_begin_sign = "EB0F4F50726F7465637420426567696E00";
    std::string sdk_end_sign = "EB0D4F50726F7465637420456E6400";
    std::string binary_data = "558BECEB0F4F50726F7465637420426567696E00C70540664A0001000000EB0D4F50726F7465637420456E64008BE55DC3558BEC81EC10000000C745FC00000000C745F800000000EB0F4F50726F7465637420426567696E006AFF6A0868160001166801000152E83702000083C4108945F468040000806A008B45F485C07505B8541B4800506801000000BB30134000E80202000083C4108945F08B5DF485DB740953E8F501000083C4048B45F0A340664A00C745FC0000000033C98D45FC8BD8415153890B83F9050F8F1E000000837DFC050F850D000000E8470000008945F8E907000000FF45FC5B59EBD483C40868020000806A00FF75F86801000000BB30134000E88E01000083C41050E85F000000EB0D4F50726F7465637420456E64008BE55DC3558BEC81EC04000000EB0F4F50726F7465637420426567696E00813D40664A0039433836B8000000000F94C0E90F000000EB0D4F50726F7465637420456E64008BE55DC35E6A004B75FBFFE6558BEC81EC04000000EB0F4F50726F7465637420426567696E00837D08010F856C000000BB06000000E8CAFFFFFF68010300806A00684000000068040000806A0068551B48006804000000BB10154000E8D100000083C43468020000806A0068010000006A006A006A006801000100682000010668210001526803000000BBB0124000E89E00000083C428E934000000BB06000000E85EFFFFFF68010300806A00681000000068040000806A0068591B48006804000000BB10154000E86500000083C434EB0D4F50726F7465637420456E64008BE55DC20400";

    std::string pattern = sdk_begin_sign + "(.*?)" + sdk_end_sign;
    
    auto match = regex_search(binary_data, pattern);

    return 0;
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值