天网搜索TSE部分源码分析-url.cpp

最新推荐文章于 2013-01-24 23:54:55 发布

最新推荐文章于 2013-01-24 23:54:55 发布 · 171 阅读

文章标签：

#Scheme #HP #Cache #Socket

搜索引擎专栏收录该内容

147 篇文章

订阅专栏

本文介绍了一个C++类CUrl的设计与实现，该类用于解析和处理URL，包括获取URL的各个组成部分如协议、主机名、路径等，并提供了一系列辅助函数如验证主机名有效性、获取IP地址等。

//根据一个给定的URL，组成消息体，发送给该URL指向的服务器。

//为此，定义Url类

//url.cpp

#include <sys/socket.h>
#include <netdb.h>

#include <iostream>
#include <string>
#include <vector>

#include "Tse.h"
#include "Url.h"
#include "Http.h"
#include "Md5.h"
#include "StrFun.h"

// Path-segment helpers used when examining "." separators.
// DOTP(x)  is true when the C string x is exactly ".".
// DDOTP(x) is true when the C string x is exactly "..".
// The argument is parenthesized at every use so that expressions with
// low-precedence operators (e.g. a ternary) expand correctly.
#define DOTP(x) ((*(x) == '.') && (!*((x) + 1)))

#define DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') && (!*((x) + 2)))

// Host name -> dotted-quad IP string cache filled by CUrl::GetIpByHost.
std::map<std::string, std::string> mapCacheHostLookup;
// Defined in another translation unit; not referenced in the visible part of this file.
extern std::vector<std::string> vsUnreachHost;
// Mutex locked around updates of mapCacheHostLookup.
pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER;
// MD5 digests of URLs, consulted by CUrl::IsVisitedUrl (defined elsewhere).
extern std::set<std::string> setVisitedUrlMD5;
// IP block table consulted by CUrl::IsValidIp (defined elsewhere).
extern std::map<unsigned long, unsigned long> mapIpBlock;
// Element type used when inserting into mapCacheHostLookup.
typedef std::map<std::string, std::string>::value_type valTypeCHL;

// Describes one URL scheme entry of the supported_schemes table.
struct scheme_data
{
    // Scheme prefix including the "://" part. Declared const because the
    // table entries point at string literals, which must not be modified.
    const char *leading_string;
    int default_port;   // default port for the scheme
    int enabled;        // non-zero when the scheme may be used
};

//所有连接情况的定义
static struct scheme_data supported_schemes[] =
{
{ "http://",DEFAULT_HTTP_PORT, 1 },
{ "ftp://",DEFAULT_FTP_PORT, 1},

{NULL,-1,0 }
};

//分析并填充连接类型

void CUrl::ParseScheme (const char *url)
{
int i;

for (i = 0;supported_schemes[i].leading_string; i++)

if (0 ==strncasecmp (url, supported_schemes[i].leading_string,
strlen (supported_schemes[i].leading_string))) {

　//判断连接类型，并更新m_eScheme成员

　　if (supported_schemes[i].enabled){
this->m_eScheme= (enum url_scheme) i;
return;
}else{
this->m_eScheme= SCHEME_INVALID;
return;
}
}

this->m_eScheme= SCHEME_INVALID;
return;
}

// Parses strUrl and stores its pieces in this object.
//
// Returns false, leaving m_sUrl/m_sHost/m_sPath untouched, when the scheme is
// not HTTP (m_eScheme is still updated by ParseScheme). Otherwise stores the
// URL, host and request path, updates m_nPort when the parsed port is
// positive, and returns true.
//
// Host and path are extracted into fixed-size buffers, so components longer
// than the buffers are truncated.
bool CUrl::ParseUrlEx(string strUrl)
{
    char protocol[10];
    char host[HOST_LEN];
    char request[256];
    int port = -1;

    // Zero-fill the scratch buffers.
    memset(protocol, 0, sizeof(protocol));
    memset(host, 0, sizeof(host));
    memset(request, 0, sizeof(request));

    // Classify the scheme; only HTTP URLs are handled further.
    this->ParseScheme(strUrl.c_str());
    if (this->m_eScheme != SCHEME_HTTP) {
        return false;
    }

    // Pass one byte less than each buffer's size: the low-level parser copies
    // with strncpy, which does not terminate when the source fills the whole
    // length. Keeping the last (already zeroed) byte out of its reach
    // guarantees every buffer stays NUL-terminated.
    ParseUrlEx(strUrl.c_str(),
               protocol, static_cast<int>(sizeof(protocol)) - 1,
               host, static_cast<int>(sizeof(host)) - 1,
               request, static_cast<int>(sizeof(request)) - 1,
               &port);

    // Publish the parsed components.
    m_sUrl = strUrl;
    m_sHost = host;
    m_sPath = request;

    // A non-positive port (e.g. "host:" with no digits) leaves m_nPort as is.
    if (port > 0) {
        m_nPort = port;
    }

    return true;
}

// Copies at most cap-1 characters of src into dst and always NUL-terminates.
// (strncpy on its own leaves dst unterminated when src has cap or more
// characters.) Does nothing when dst is NULL or cap is not positive.
static void CopyTruncated(char *dst, const char *src, int cap)
{
    if (dst == NULL || cap <= 0) {
        return;
    }
    strncpy(dst, src, static_cast<size_t>(cap - 1));
    dst[cap - 1] = 0;
}

// Splits `url` into protocol, host, request and port.
//
//   url       NUL-terminated URL text; NULL yields empty outputs and port 80.
//   protocol  receives the text before the first ':'; "HTTP" if there is none.
//   host      receives the run of valid host characters after an optional
//             "//"; a ":port" suffix is cut off and parsed into *port.
//   request   receives everything in url after the host[:port] part.
//   lprotocol, lhost, lrequest
//             capacities of the three output buffers in bytes; output is
//             truncated to fit and is always NUL-terminated.
//   port      receives the parsed port; 80 when the URL names none.
//
// NOTE(review): the first ':' anywhere in the URL is taken as the scheme
// delimiter, so a scheme-less "host:8080/path" is split as protocol="host",
// host="8080". The std::string overload only calls this after verifying an
// "http://" prefix, so that path is unaffected.
void CUrl::ParseUrlEx(const char *url,
        char *protocol, int lprotocol,
        char *host, int lhost,
        char *request, int lrequest,
        int *port)
{
    *protocol = *host = *request = 0;
    *port = 80;

    if (url == NULL) {
        return;
    }

    // Mutable scratch copy of the URL (including its terminator); the vector
    // releases it automatically on every return path.
    const size_t len = strlen(url);
    std::vector<char> scratch(url, url + len + 1);
    char *work = &scratch[0];

    // Protocol: text before the first ':'; otherwise assume HTTP.
    char *cursor = strchr(work, ':');
    if (cursor != NULL) {
        *(cursor++) = 0;    // terminate the protocol inside the scratch copy
        CopyTruncated(protocol, work, lprotocol);
    } else {
        // Some URLs omit the "http" prefix.
        CopyTruncated(protocol, "HTTP", lprotocol);
        cursor = work;
    }

    // Skip the "//" that follows the scheme, if present.
    if ((cursor[0] == '/') && (cursor[1] == '/')) {
        cursor += 2;
    }

    // Host: the longest run of characters accepted by IsValidHostChar.
    char *hostEnd = cursor;
    while (*hostEnd && IsValidHostChar(*hostEnd)) {
        ++hostEnd;
    }

    // The request starts at the same offset in the untouched original URL.
    const size_t requestOffset = static_cast<size_t>(hostEnd - work);

    *hostEnd = 0;           // terminate the host inside the scratch copy
    CopyTruncated(host, cursor, lhost);
    CopyTruncated(request, url + requestOffset, lrequest);

    // Port: split "host:port" at the ':' (':' is a valid host character, so
    // the port is still attached to host at this point).
    char *colon = strchr(host, ':');
    if (colon != NULL) {
        *colon = 0;
        *port = atoi(colon + 1);
    }
}

// Default constructor: no URL has been parsed yet.
CUrl::CUrl()
{
    // Every textual component starts out empty.
    m_sUrl = "";
    m_sHost = "";
    m_sPath = "";

    // No scheme recognised yet; the port defaults to the HTTP port.
    m_eScheme = SCHEME_INVALID;
    m_nPort = DEFAULT_HTTP_PORT;
}

// Destructor: performs no explicit cleanup.
CUrl::~CUrl() {}

// Returns a new[]-allocated, NUL-terminated copy of `text`.
static char *NewCStringCopy(const char *text)
{
    const size_t len = strlen(text);
    char *copy = new char[len + 1];
    memcpy(copy, text, len + 1);    // includes the terminator
    return copy;
}

// Resolves `host` to a dotted-quad IPv4 string.
//
// Resolution order:
//   1. host is already a dotted-quad address -> a copy of host is returned;
//   2. host is in mapCacheHostLookup with a parsable address -> that address;
//   3. otherwise the system resolver is asked and the answer is cached.
//
// Returns a buffer allocated with new[] that the caller must release with
// delete[], or NULL when host is NULL, fails IsValidHost, cannot be resolved,
// or the resolved address cannot be formatted.
char * CUrl::GetIpByHost(const char *host)
{
    if (!host) {                    // null pointer
        return NULL;
    }

    if (!IsValidHost(host)) {       // invalid host
        return NULL;
    }

    // 1. The host is already an IP address in XX.XX.XX.XX form.
    unsigned long inaddr = (unsigned long)inet_addr(host);
    if (inaddr != INADDR_NONE) {
        return NewCStringCopy(host);
    }

    // 2. Look the name up in the cache. The map is modified under
    //    mutexCacheHost further down, so it is read under the same lock and
    //    the value is copied out before the lock is released.
    std::string cachedIp;
    pthread_mutex_lock(&mutexCacheHost);
    std::map<std::string, std::string>::iterator hit = mapCacheHostLookup.find(host);
    if (hit != mapCacheHostLookup.end()) {
        cachedIp = hit->second;
    }
    pthread_mutex_unlock(&mutexCacheHost);

    if (!cachedIp.empty()) {
        inaddr = (unsigned long)inet_addr(cachedIp.c_str());
        if (inaddr != INADDR_NONE) {
            return NewCStringCopy(cachedIp.c_str());
        }
        // An unparsable cache entry falls through to the resolver.
    }

    // 3. Ask the resolver. getaddrinfo is used instead of gethostbyname
    //    because gethostbyname returns a pointer to shared static storage and
    //    is not safe to call from several threads. Only IPv4 results are
    //    requested, matching the AF_INET formatting below.
    struct addrinfo hints;
    memset(&hints, 0, sizeof(hints));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;

    struct addrinfo *answers = NULL;
    if (getaddrinfo(host, NULL, &hints, &answers) != 0 || answers == NULL) {
        return NULL;
    }

    // Format the first address as dotted-quad text.
    char abuf[INET_ADDRSTRLEN];
    const struct sockaddr_in *sin =
        reinterpret_cast<const struct sockaddr_in *>(answers->ai_addr);
    const char *formatted = inet_ntop(AF_INET, &sin->sin_addr, abuf, sizeof(abuf));
    freeaddrinfo(answers);

    if (formatted == NULL) {
        std::cout << "inet_ntop() return error inGetIpByHost" << std::endl;
        return NULL;
    }

    // Remember the answer. map::insert leaves an existing entry for the same
    // host untouched, so no separate find() is needed.
    pthread_mutex_lock(&mutexCacheHost);
    mapCacheHostLookup.insert(valTypeCHL(host, abuf));
    pthread_mutex_unlock(&mutexCacheHost);

    return NewCStringCopy(abuf);
}

// Returns true when ch may appear in the host[:port] part of a URL:
// letters, digits, '-', '.', ':' and '_'.
bool CUrl::IsValidHostChar(char ch)
{
    // The <cctype> classifiers require a value representable as unsigned char
    // (or EOF); passing a negative plain char is undefined behaviour.
    const unsigned char uch = static_cast<unsigned char>(ch);

    return (isalpha(uch) || isdigit(uch)
            || ch == '-' || ch == '.' || ch == ':' || ch == '_');
}

// Returns true when host is non-NULL, at least 6 characters long and made up
// only of characters accepted by IsValidHostChar.
bool CUrl::IsValidHost(const char *host)
{
    if (!host) {
        return false;
    }

    // Reject short names such as "www" or "pku".
    if (strlen(host) < 6) {
        return false;
    }

    // Check every character up to the terminator. (Advancing the pointer
    // while also re-evaluating strlen() on it would stop half way through.)
    for (const char *p = host; *p != 0; ++p) {
        if (!IsValidHostChar(*p)) {
            return false;
        }
    }

    return true;
}

bool CUrl::IsVisitedUrl(const char *url)
{
if( !url ){
return true; // if be null, wethink it have been visited
}

CMD5 iMD5;
iMD5.GenerateMD5( (unsigned char*)url,strlen(url) );
string strDigest = iMD5.ToString();

if( setVisitedUrlMD5.find(strDigest) !=setVisitedUrlMD5.end() ) {
return true;
} else {
return false;
}

}

// Checks ip against the blocks listed in mapIpBlock.
//
// Returns false when ip is NULL or is not parsable by inet_addr. When
// mapIpBlock is empty every parsable address is accepted. Otherwise the
// address is accepted only if, for some entry,
// (address & ~entry.second) == entry.first.
// NOTE(review): presumably entry.first is a network address and entry.second
// an inverted netmask, both in the byte order inet_addr returns — confirm
// against the code that fills mapIpBlock.
bool CUrl::IsValidIp(const char *ip)
{
    if (ip == NULL) {
        return false;
    }

    const unsigned long inaddr = (unsigned long)inet_addr(ip);
    if (inaddr == INADDR_NONE) {    // invalid ip
        return false;
    }

    // If no block range is given, every address counts as inside.
    if (mapIpBlock.empty()) {
        return true;
    }

    std::map<unsigned long, unsigned long>::iterator pos;
    for (pos = mapIpBlock.begin(); pos != mapIpBlock.end(); ++pos) {
        const unsigned long masked = inaddr & ~(pos->second);
        if (masked == pos->first) { // inside this block
            return true;
        }
    }

    // outside every block
    return false;
}

// Classifies host as foreign (true) or home (false).
//
// Foreign: an empty host, a host longer than HOST_LEN, or a host whose text
// after the last '.' (lower-cased) is not one of the suffixes listed below;
// this includes hosts that contain no '.' at all.
// Home: a dotted-quad IP address, or a host ending in a listed suffix.
bool CUrl::IsForeignHost(string host)
{
    if (host.empty()) return true;
    if (host.size() > HOST_LEN) return true;

    // A bare IP address is never treated as foreign.
    const unsigned long inaddr = (unsigned long)inet_addr(host.c_str());
    if (inaddr != INADDR_NONE) {
        return false;
    }

    // Take the text after the last '.', lower-cased. It stays empty when the
    // host contains no '.'.
    std::string suffix;
    const std::string::size_type dot = host.rfind('.');
    if (dot != std::string::npos) {
        suffix = host.substr(dot + 1);
    }
    CStrFun::Str2Lower(suffix, suffix.size());

    // Suffixes treated as "home". The element count is derived from the table
    // so that adding an entry cannot leave a stale hard-coded size behind.
    static const char *const home_host[] = {
        "cn", "com", "net", "org", "info",
        "biz", "tv", "cc", "hk", "tw"
    };
    const size_t home_host_num = sizeof(home_host) / sizeof(home_host[0]);

    for (size_t i = 0; i < home_host_num; i++) {
        if (suffix == home_host[i]) {
            return false;
        }
    }

    return true;
}

bool CUrl::IsImageUrl(string url)
{
if( url.empty() ) return false;
if( url.size() > HOST_LEN ) returnfalse;

string::size_type idx = url.rfind('.');
string tmp;
if( idx != string::npos ){
tmp = url.substr(idx+1);
}