简单高效地解析形如 [scheme://]host[:port][path][?query][#fragment] 的 URI。
【头文件】
/// @brief Components of a uri of the form
///        [scheme://]host[:port][path][?query][#fragment].
///
/// A default-constructed (or cleared) URI holds the defaults documented in
/// Clear().
struct URI
{
    std::string scheme;   // protocol string (http/https/ftp/...)
    std::string host;     // host string (domain or ip)
    // Standard spelling of the type; the non-standard `ushort` typedef is not
    // available on every toolchain.
    unsigned short port;  // port number
    std::string path;     // path string (e.g. /index.html)
    std::string query;    // query string (e.g. a=1&b=2)
    std::string fragment; // additional identifying information

    /// @brief Constructs a URI whose fields hold the defaults set by Clear().
    URI() { Clear(); }

    /// @brief Resets every field to its default value.
    void Clear()
    {
        scheme = "http";  // default "http"
        host.clear();     // default ""
        port = 0;         // default 0
        path = "/";       // default "/"
        query.clear();    // default ""
        fragment.clear(); // default ""
    }
};
/// @brief Parses a uri of the form [scheme://]host[:port][path][?query][#fragment]
/// @param[in] uristr the uri string to parse
/// @param[out] uri the parse result
/// @return true if parsing succeeds, false otherwise
/// @attention when parsing fails, the out-parameter is undefined
bool ParseUri(const string& uristr, URI& uri);
【实现文件】
/// @brief Tells whether a character terminates the authority (host[:port])
///        part of a uri: end of string, '/', '?' or '#'.
static bool IsAuthorityEnd(char c)
{
    return 0 == c || '/' == c || '?' == c || '#' == c;
}

/// @brief Advances to the first '.', ':' or authority terminator.
/// @param[in] p start of the scan (NUL-terminated string)
/// @return pointer to the character that stopped the scan
static const char* SkipToDotOrColon(const char* p)
{
    while (*p != '.' && *p != ':' && !IsAuthorityEnd(*p))
        ++p;
    return p;
}

/// @brief Parses a uri of the form [scheme://]host[:port][path][?query][#fragment]
/// @param[in] uristr the uri string to parse
/// @param[out] uri the parse result; it is cleared first, so components that
///                 are absent from uristr keep the defaults of URI::Clear()
/// @return true if parsing succeeds, false otherwise
/// @attention when parsing fails, the out-parameter is unspecified (it may be
///            partially filled)
///
/// Rules enforced:
///  - the host must contain at least one '.';
///  - at most one, non-empty, scheme is accepted and it must be followed by
///    "//" and at least one more character;
///  - the port must start with a digit 1-9, must not exceed 65535 and must be
///    followed by the end of the string, '/', '?' or '#'.
bool ParseUri(const std::string& uristr, URI& uri)
{
    const char* begin = uristr.c_str();
    const char* end = begin + uristr.size();
    uri.Clear();

    // The scan stops at authority terminators too, so a '.' or ':' that lives
    // in the path/query/fragment is never mistaken for part of the host.
    const char* ptr = SkipToDotOrColon(begin);
    if (':' == *ptr) // a ':' before the first '.' can only end the scheme
    {
        if (ptr == begin)
            return false; // empty scheme
        if (end - ptr <= 3)
            return false; // no room for "//" plus a host
        if (*(ptr + 1) != '/' || *(ptr + 2) != '/')
            return false;
        uri.scheme.assign(begin, ptr - begin);
        begin = (ptr += 3); // skip "://" to the host begin
        ptr = SkipToDotOrColon(begin);
    }
    // Anything but '.' here means a host without a dot, or a second ':'
    // before the first dot (a repeated scheme); both are rejected.
    if (*ptr != '.')
        return false;

    // find the host end
    char c = 0;
    while ((c = *ptr) != ':' && !IsAuthorityEnd(c))
        ++ptr;
    uri.host.assign(begin, ptr - begin);

    if (':' == c) // find the port
    {
        c = *(++ptr);
        if (c < '1' || c > '9')
            return false; // should be a number (and should be > 0)
        // Accumulate in a wider type so that values above 65535 are detected
        // instead of silently wrapping around.
        unsigned int port = 0;
        for (; (c = *ptr) >= '0' && c <= '9'; ++ptr)
        {
            port = port * 10 + static_cast<unsigned int>(c - '0');
            if (port > 65535u)
                return false; // out of the 16-bit port range
        }
        if (!IsAuthorityEnd(c))
            return false; // garbage right after the port digits
        uri.port = static_cast<unsigned short>(port);
    }

    if ('/' == c) // find the path (the leading '/' is part of it)
    {
        for (begin = ptr; (c = *ptr) != 0 && c != '?' && c != '#'; )
            ++ptr;
        uri.path.assign(begin, ptr - begin);
    }

    if ('?' == c) // find the query string (the '?' itself is skipped)
    {
        for (begin = ++ptr; (c = *ptr) != 0 && c != '#'; )
            ++ptr;
        uri.query.assign(begin, ptr - begin);
    }

    if ('#' == c) // the fragment is everything after '#'
        uri.fragment.assign(ptr + 1, end - ptr - 1);

    return true;
}
【单元测试】
// Unit-test body for ParseUri: every valid case parses one uri string and then
// checks all six URI fields; components missing from the input are expected to
// keep the defaults set by URI::Clear() (scheme "http", port 0, path "/",
// empty query and fragment). The same URI object is reused across cases
// because ParseUri clears it before parsing.
URI uri;
// full content: every component is present
string uristr = "ftp://www.baidu.com:21/abc.txt?a=b&c=3#frag=10,20";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("ftp", uri.scheme);
ASSERT_EQ("www.baidu.com", uri.host);
ASSERT_EQ(21, uri.port);
ASSERT_EQ("/abc.txt", uri.path);
ASSERT_EQ("a=b&c=3", uri.query);
ASSERT_EQ("frag=10,20", uri.fragment);
// no scheme: scheme keeps its default "http"
uristr = "www.a-b-c.com:8080/def.vim?xy=12#xxx";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("http", uri.scheme);
ASSERT_EQ("www.a-b-c.com", uri.host);
ASSERT_EQ(8080, uri.port);
ASSERT_EQ("/def.vim", uri.path);
ASSERT_EQ("xy=12", uri.query);
ASSERT_EQ("xxx", uri.fragment);
// no port: port keeps its default 0
uristr = "http://www.baidu.com/abc.xyz?ab=cd#yyy";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("http", uri.scheme);
ASSERT_EQ("www.baidu.com", uri.host);
ASSERT_EQ(0, uri.port);
ASSERT_EQ("/abc.xyz", uri.path);
ASSERT_EQ("ab=cd", uri.query);
ASSERT_EQ("yyy", uri.fragment);
// no path: path keeps its default "/"
uristr = "http://www.baidu.com:8080?m=5#zzz";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("http", uri.scheme);
ASSERT_EQ("www.baidu.com", uri.host);
ASSERT_EQ(8080, uri.port);
ASSERT_EQ("/", uri.path);
ASSERT_EQ("m=5", uri.query);
ASSERT_EQ("zzz", uri.fragment);
// no query: query stays empty
uristr = "http://www.qq.com:8888/def.txt#x=y";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("http", uri.scheme);
ASSERT_EQ("www.qq.com", uri.host);
ASSERT_EQ(8888, uri.port);
ASSERT_EQ("/def.txt", uri.path);
ASSERT_EQ("", uri.query);
ASSERT_EQ("x=y", uri.fragment);
// no fragment: fragment stays empty
uristr = "http://www.soso.com:1234/abc.html?x=y";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("http", uri.scheme);
ASSERT_EQ("www.soso.com", uri.host);
ASSERT_EQ(1234, uri.port);
ASSERT_EQ("/abc.html", uri.path);
ASSERT_EQ("x=y", uri.query);
ASSERT_EQ("", uri.fragment);
// no scheme & port (the path here is just "/")
uristr = "www.sohu.com/?x=y";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("http", uri.scheme);
ASSERT_EQ("www.sohu.com", uri.host);
ASSERT_EQ(0, uri.port);
ASSERT_EQ("/", uri.path);
ASSERT_EQ("x=y", uri.query);
ASSERT_EQ("", uri.fragment);
// no scheme & path
uristr = "www.baidu.com:8080?p=345";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("http", uri.scheme);
ASSERT_EQ("www.baidu.com", uri.host);
ASSERT_EQ(8080, uri.port);
ASSERT_EQ("/", uri.path);
ASSERT_EQ("p=345", uri.query);
ASSERT_EQ("", uri.fragment);
// no scheme & query
uristr = "www.baidu.com:8080/abc.txt";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("http", uri.scheme);
ASSERT_EQ("www.baidu.com", uri.host);
ASSERT_EQ(8080, uri.port);
ASSERT_EQ("/abc.txt", uri.path);
ASSERT_EQ("", uri.query);
ASSERT_EQ("", uri.fragment);
// no port & path
uristr = "https://www.baidu.com?x=3";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("https", uri.scheme);
ASSERT_EQ("www.baidu.com", uri.host);
ASSERT_EQ(0, uri.port);
ASSERT_EQ("/", uri.path);
ASSERT_EQ("x=3", uri.query);
ASSERT_EQ("", uri.fragment);
// no port & query
uristr = "https://www.baidu.com/def.html";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("https", uri.scheme);
ASSERT_EQ("www.baidu.com", uri.host);
ASSERT_EQ(0, uri.port);
ASSERT_EQ("/def.html", uri.path);
ASSERT_EQ("", uri.query);
ASSERT_EQ("", uri.fragment);
// no path & query
uristr = "https://www.google.com:12345";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("https", uri.scheme);
ASSERT_EQ("www.google.com", uri.host);
ASSERT_EQ(12345, uri.port);
ASSERT_EQ("/", uri.path);
ASSERT_EQ("", uri.query);
ASSERT_EQ("", uri.fragment);
// no scheme & port & path
uristr = "www.baidu.com?z=5";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("http", uri.scheme);
ASSERT_EQ("www.baidu.com", uri.host);
ASSERT_EQ(0, uri.port);
ASSERT_EQ("/", uri.path);
ASSERT_EQ("z=5", uri.query);
ASSERT_EQ("", uri.fragment);
// no scheme & port & query
uristr = "www.baidu.com/index.htm";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("http", uri.scheme);
ASSERT_EQ("www.baidu.com", uri.host);
ASSERT_EQ(0, uri.port);
ASSERT_EQ("/index.htm", uri.path);
ASSERT_EQ("", uri.query);
ASSERT_EQ("", uri.fragment);
// no scheme & path & query
uristr = "www.yahoo.com:1234";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("http", uri.scheme);
ASSERT_EQ("www.yahoo.com", uri.host);
ASSERT_EQ(1234, uri.port);
ASSERT_EQ("/", uri.path);
ASSERT_EQ("", uri.query);
ASSERT_EQ("", uri.fragment);
// no port & path & query
uristr = "ftp://www.baidu.com";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("ftp", uri.scheme);
ASSERT_EQ("www.baidu.com", uri.host);
ASSERT_EQ(0, uri.port);
ASSERT_EQ("/", uri.path);
ASSERT_EQ("", uri.query);
ASSERT_EQ("", uri.fragment);
// no scheme & port & path & query: host only
uristr = "www.360.com";
ASSERT_TRUE(ParseUri(uristr, uri));
ASSERT_EQ("http", uri.scheme);
ASSERT_EQ("www.360.com", uri.host);
ASSERT_EQ(0, uri.port);
ASSERT_EQ("/", uri.path);
ASSERT_EQ("", uri.query);
ASSERT_EQ("", uri.fragment);
// invalid urls
// - the string ends before any '.' or ':' is found
ASSERT_FALSE(ParseUri("www", uri));
// - a ':' before the first '.' is taken as the scheme separator, but no
//   "//" plus host follows it
ASSERT_FALSE(ParseUri("host:80", uri));
// - nothing follows the "://"
ASSERT_FALSE(ParseUri("http://", uri));
// - the ':' of the scheme is not followed by "//"
ASSERT_FALSE(ParseUri("http:/|a.b.com", uri));
// - the port must start with a digit 1-9
ASSERT_FALSE(ParseUri("http://qq.com:023", uri));