0. 起因
今天是2025年3月15日0点34分,我的舍友此时正在王者荣耀双排开黑,而我既十分惊讶又十分后悔。惊讶是因为我发现了一种新的二进制正则匹配办法,而且这个办法比我之前想到的还要好;后悔的是我居然今天才想出来。
1. 第一种办法
我之前想到的第一种办法是:硬解析二进制
#include <boost/regex.hpp>
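// 2025-03-11: after investigation, std::regex_traits is only specialized for char and wchar_t, so std::basic_regex cannot be used with uint8_t directly.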
// Position of a match (or capture group) inside the searched buffer, stored as
// a pair of element offsets measured from the beginning of that buffer.
// NOTE(review): names starting with an underscore followed by an uppercase
// letter are reserved for the implementation in C++; consider renaming.
struct _Range {
// Offset of the first matched element.
size_t start_pos;
// Offset one past the last matched element.
size_t end_pos;
};
// One regex match over a byte buffer: entry 0 is the whole match and the
// following entries are the capture groups. Each entry pairs a non-owning view
// of the matched bytes with its offsets in the searched buffer.
using MatchItem = std::vector<std::pair<std::basic_string_view<std::uint8_t>, _Range>>;
// Every match produced by one search, in the order the matches were found.
using MatchList = std::vector<MatchItem>;
// Escapes every regex metacharacter in `str` so the bytes can be embedded in a
// pattern and matched literally.
//
// Each of the bytes . ^ $ * + ? ( ) [ ] { } | \ is prefixed with a backslash;
// every other byte (including 0x00 and values >= 0x80) is copied unchanged.
// Returns the escaped byte string.
//
// The scan below produces the same output as the earlier std::regex_replace
// version without compiling a std::regex and converting the buffer to
// std::string and back on every call.
std::basic_string<uint8_t> bin_escape_seq(const std::basic_string<uint8_t>& str) {
    std::basic_string<uint8_t> escaped;
    escaped.reserve(str.size());
    for (const uint8_t byte : str) {
        switch (byte) {
        case '.': case '^': case '$': case '*': case '+': case '?':
        case '(': case ')': case '[': case ']': case '{': case '}':
        case '|': case '\\':
            escaped.push_back('\\');
            break;
        default:
            break;
        }
        escaped.push_back(byte);
    }
    return escaped;
}
// Regex search-and-replace over a byte buffer.
//
// `regex` holds the pattern and `replaceStr` the replacement format; both are
// handed to std::regex_replace after being copied into std::string, and the
// result is copied back into a byte string.
std::basic_string<uint8_t> bin_replace(std::basic_string_view<uint8_t> str, std::basic_string_view<uint8_t> regex, std::basic_string_view<uint8_t> replaceStr) {
    // Compile the pattern first so an invalid pattern fails before any copying.
    const std::regex compiled(std::string(regex.begin(), regex.end()));
    const std::string subject(str.begin(), str.end());
    const std::string replacement(replaceStr.begin(), replaceStr.end());
    const std::string replaced = std::regex_replace(subject, compiled, replacement);
    return std::basic_string<uint8_t>(replaced.begin(), replaced.end());
}
// Returns true when the pattern in `regex` matches the whole of `str`
// (std::regex_match semantics). Both buffers are copied into std::string first.
bool bin_match(std::basic_string_view<uint8_t> str, std::basic_string_view<uint8_t> regex) {
    const std::regex compiled(std::string(regex.begin(), regex.end()));
    const std::string subject(str.begin(), str.end());
    return std::regex_match(subject, compiled);
}
// Finds every successive match of the pattern `regex` inside the byte buffer `str`.
//
// Returns one MatchItem per match. Entry 0 of an item is the whole match and
// the remaining entries are the capture groups; each entry pairs a view of the
// matched bytes with its offsets in `str`. The views refer to the caller's
// buffer, so they are only valid while that buffer is alive.
//
// NOTE(review): the pattern is compiled as a char-based std::regex while the
// subject is iterated as uint8_t; where char is signed, bytes >= 0x80 may not
// compare equal to the pattern -- confirm on the target toolchain.
// NOTE: every search restarts at the end of the previous match without
// match_prev_avail, so `^` can match at each restart position.
MatchList bin_search(std::basic_string_view<std::uint8_t> str, std::basic_string_view<uint8_t> regex) {
    MatchList ret{};
    auto begin = str.cbegin();
    const auto end = str.cend();
    std::match_results<decltype(str)::const_iterator> m;
    const std::regex r(std::string(regex.begin(), regex.end()));
    while (std::regex_search(begin, end, m, r)) {
        MatchItem item{};
        item.reserve(m.size());
        for (size_t i = 0; i < m.size(); i++) {
            const _Range pos{
                static_cast<size_t>(m[i].first - str.begin()),
                static_cast<size_t>(m[i].second - str.begin()),
            };
            item.emplace_back(str.substr(pos.start_pos, pos.end_pos - pos.start_pos), pos);
        }
        ret.push_back(item);

        const bool empty_match = (m[0].first == m[0].second);
        begin = m[0].second;
        if (empty_match) {
            // An empty match does not consume input; without stepping forward
            // the next search would find the same empty match forever.
            if (begin == end) {
                break;
            }
            ++begin;
        }
    }
    return ret;
}
就是这样的代码,如此直接,把二进制数据硬生生地代入到正则匹配中。我也想到regex模式串中可能存在需要转义的字节,因此搭配bin_escape_seq()来实现转义。
然后真实的应用场景就是这样。。。
// Locate every region enclosed by the SDK begin/end markers in a section.
// The marker bytes after the two-byte prefix are the ASCII text
// "OProtect Begin\0" and "OProtect End\0"; both are escaped so they match literally.
std::basic_string<uint8_t> sdk_begin_sign = bin_escape_seq({ 0xEB,0x0F,0x4F,0x50,0x72,0x6F,0x74,0x65,0x63,0x74,0x20,0x42,0x65,0x67,0x69,0x6E,0x00 });
std::basic_string<uint8_t> sdk_end_sign = bin_escape_seq({ 0xEB,0x0D,0x4F,0x50,0x72,0x6F,0x74,0x65,0x63,0x74,0x20,0x45,0x6E,0x64,0x00 });
std::basic_string<uint8_t> sec_content(sec.content().begin(), sec.content().end());
// `[\s\S]` is used instead of `.` because `.` does not match line-terminator
// bytes such as 0x0A/0x0D, and the quantifier is lazy so that each begin marker
// pairs with the nearest end marker instead of the last one in the section.
const std::string_view any_bytes = R"(([\s\S]*?))";
auto matchs = bin_search(sec_content, sdk_begin_sign + std::basic_string<uint8_t>(any_bytes.begin(), any_bytes.end()) + sdk_end_sign);
for (const auto& match : matchs) {
    // match[0] is the whole match, i.e. the region including both markers.
    func.begin = off + match[0].second.start_pos;
    func.end = off + match[0].second.end_pos;
    func.size = func.end - func.begin;
    result.push_back(func);
}
我来总结下这种办法的优缺点
优点:
- 方便直接,省空间
缺点:
- 模式串在代码里特别不好看
- 正则匹配容易出现问题(会有误匹配和漏匹配)
- 模式串还会被0x00截断
总结下来就是,太姬芭沙比了
2. 第二种办法
灵感是从 ghidra 和 ida 想到的
我们都说,正则匹配一般是为可见字符(也可以叫"可打印字符")服务的,像上面这种硬是给不可见字符(也可以叫"不可打印字符")服务,不仅正则引擎嫌难受,我们看着用着也难受
要想得到最有效的结果,基本的原则自然就是要与正则引擎"门当户对"。可见字符是吧,我们直接把二进制数据解析成可见字符不就行了?
把一个个字节变成一个个可见字符,就可以了
比如有一个字节 0x3C,我们将其解析成'3'和'C'两个字符,换成ASCII码形式就是0x33、0x43
每个十六进制字符总共只有16种取值:0、1、2、3、4、5、6、7、8、9、A、B、C、D、E、F
就这么简单,16个字符里没有一个需要特殊转义的,用就行了,而且在代码里边看的也非常顺(要是嫌太长,自己写个宏、保存到文件里再读取都行)
我想到的唯一的缺点,就是太占内存空间了:一个字节对应两个十六进制字符,而保存这两个字符本身就需要两个字节,也就是说,如果要匹配100字节的数据,你就得准备200字节的空间
功能:
// Position of a match (or capture group) inside the searched string, stored as
// a pair of character offsets measured from the beginning of that string.
// NOTE(review): names starting with an underscore followed by an uppercase
// letter are reserved for the implementation in C++; consider renaming.
struct _Range {
// Offset of the first matched character.
size_t start_pos;
// Offset one past the last matched character.
size_t end_pos;
};
// One regex match: entry 0 is the whole match and the following entries are
// the capture groups. Each entry pairs a non-owning view of the matched text
// with its offsets in the searched string.
using MatchItem = std::vector<std::pair<std::string_view, _Range>>;
// Every match produced by one search, in the order the matches were found.
using MatchList = std::vector<MatchItem>;
// Escapes every regex metacharacter in `str` so the text can be embedded in a
// pattern and matched literally.
//
// Each of the characters . ^ $ * + ? ( ) [ ] { } | \ is prefixed with a
// backslash; every other character is copied unchanged. Returns the escaped text.
//
// The scan below produces the same output as the earlier std::regex_replace
// version without compiling a std::regex on every call.
std::string regex_escape_seq(std::string_view str) {
    std::string escaped;
    escaped.reserve(str.size());
    for (const char ch : str) {
        switch (ch) {
        case '.': case '^': case '$': case '*': case '+': case '?':
        case '(': case ')': case '[': case ']': case '{': case '}':
        case '|': case '\\':
            escaped.push_back('\\');
            break;
        default:
            break;
        }
        escaped.push_back(ch);
    }
    return escaped;
}
// Replaces every match of the pattern `regex` in `str` with `replaceStr`
// (a std::regex_replace format string) and returns the resulting text.
std::string regex_replace(std::string_view str, std::string_view regex, std::string_view replaceStr) {
    // Compile the pattern first so an invalid pattern fails before any copying.
    const std::regex compiled(std::string(regex.begin(), regex.end()));
    const std::string subject(str);
    const std::string replacement(replaceStr);
    return std::regex_replace(subject, compiled, replacement);
}
// Returns true when the pattern `regex` matches the whole of `str`
// (std::regex_match semantics).
bool regex_match(std::string_view str, std::string_view regex) {
    const std::regex compiled{std::string(regex)};
    const std::string subject(str);
    return std::regex_match(subject, compiled);
}
// Finds every successive match of the pattern `regex` inside `str`.
//
// Returns one MatchItem per match. Entry 0 of an item is the whole match and
// the remaining entries are the capture groups; each entry pairs a view of the
// matched text with its character offsets in `str`. The views refer to the
// caller's buffer, so they are only valid while that buffer is alive.
//
// NOTE: every search restarts at the end of the previous match without
// match_prev_avail, so `^` can match at each restart position.
MatchList regex_search(std::string_view str, std::string_view regex) {
    MatchList ret{};
    auto begin = str.cbegin();
    const auto end = str.cend();
    std::match_results<decltype(str)::const_iterator> m;
    const std::regex r(std::string(regex.begin(), regex.end()));
    while (std::regex_search(begin, end, m, r)) {
        MatchItem item{};
        item.reserve(m.size());
        for (size_t i = 0; i < m.size(); i++) {
            const _Range pos{
                static_cast<size_t>(m[i].first - str.begin()),
                static_cast<size_t>(m[i].second - str.begin()),
            };
            item.emplace_back(str.substr(pos.start_pos, pos.end_pos - pos.start_pos), pos);
        }
        ret.push_back(item);

        const bool empty_match = (m[0].first == m[0].second);
        begin = m[0].second;
        if (empty_match) {
            // An empty match does not consume input; without stepping forward
            // the next search would find the same empty match forever.
            if (begin == end) {
                break;
            }
            ++begin;
        }
    }
    return ret;
}
// Formats one byte as exactly two hexadecimal digits, most significant nibble
// first. `uppercase` selects "A"-"F" (true) or "a"-"f" (false) for digits above 9.
std::string bytes2String(uint8_t bytehex, bool uppercase) {
    static const char kUpperDigits[] = "0123456789ABCDEF";
    static const char kLowerDigits[] = "0123456789abcdef";
    const char* const digits = uppercase ? kUpperDigits : kLowerDigits;
    const char high_digit = digits[bytehex / 16];
    const char low_digit = digits[bytehex % 16];
    return std::string{high_digit, low_digit};
}
// Hex-encodes a byte string: every input byte becomes two hexadecimal digits,
// so the result is twice as long as `bytes`. `uppercase` is forwarded to
// bytes2String to choose the letter case of the digits (upper case by default).
std::string convertByteString(const std::basic_string<uint8_t>& bytes, bool uppercase = true) {
    std::string hexText;
    hexText.reserve(2 * bytes.size());
    for (auto it = bytes.begin(); it != bytes.end(); ++it) {
        hexText.append(bytes2String(*it, uppercase));
    }
    return hexText;
}
// Returns the numeric value (0-15) of a single hexadecimal digit; both
// upper-case and lower-case letters are accepted.
// Throws std::invalid_argument("Invalid hex character") for any other character.
static uint8_t hexDigitValue(char digit) {
    if (digit >= '0' && digit <= '9') {
        return static_cast<uint8_t>(digit - '0');
    }
    if (digit >= 'A' && digit <= 'F') {
        return static_cast<uint8_t>(digit - 'A' + 10);
    }
    if (digit >= 'a' && digit <= 'f') {
        return static_cast<uint8_t>(digit - 'a' + 10);
    }
    throw std::invalid_argument("Invalid hex character");
}

// Decodes hexadecimal text back into bytes: each pair of digits in `hexStr`
// becomes one byte, with the first digit of the pair as the high nibble.
//
// Throws std::invalid_argument when the length of `hexStr` is odd or when it
// contains a character that is not a hexadecimal digit.
std::basic_string<uint8_t> convertBytes(const std::string& hexStr) {
    if (hexStr.length() % 2 != 0) {
        throw std::invalid_argument("Hex string length must be even");
    }
    std::basic_string<uint8_t> result;
    result.reserve(hexStr.length() / 2);
    for (size_t i = 0; i < hexStr.length(); i += 2) {
        // The high digit is validated before the low digit, as before.
        const uint8_t high = hexDigitValue(hexStr[i]);
        const uint8_t low = hexDigitValue(hexStr[i + 1]);
        result.push_back(static_cast<uint8_t>((high << 4) | low));
    }
    return result;
}
测试代码:
int main() {
std::string _sdk_begin_sign = std::string() + '\xEB' + '\x0F' + '\x4F' + '\x50' + '\x72' + '\x6F' + '\x74' + '\x65' + '\x63' + '\x74' + '\x20' + '\x42' + '\x65' + '\x67' + '\x69' + '\x6E' + '\x00';
std::string _sdk_end_sign = std::string() + '\xEB' + '\x0D' + '\x4F' + '\x50' + '\x72' + '\x6F' + '\x74' + '\x65' + '\x63' + '\x74' + '\x20' + '\x45' + '\x6E' + '\x64' + '\x00';
std::string sdk_begin_sign = "EB0F4F50726F7465637420426567696E00";
std::string sdk_end_sign = "EB0D4F50726F7465637420456E6400";
std::string binary_data
std::string pattern = sdk_begin_sign + "(.*?)" + sdk_end_sign;
auto match = regex_search(binary_data, pattern);
return 0;
}