天网搜索TSE部分源码分析-url.cpp

本文介绍了一个URL处理类的实现细节,包括如何解析URL、获取主机IP、验证主机及URL的有效性等。通过C++代码展示了从字符串中提取协议、主机、路径和端口的过程。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

//根据一个给定的URL,组成消息体,发送给该URL指向的服务器。

//为此,定义Url类
//url.cpp

#include<iostream>
#include <string>
#include <sys/socket.h>
#include <netdb.h>

#include "Tse.h"
#include "Url.h"
#include "Http.h"
#include "Md5.h"
#include "StrFun.h"

// Path-segment helpers operating on a NUL-terminated C string:
//   DOTP(x)  is true when the string starting at x is exactly "."
//   DDOTP(x) is true when the string starting at x is exactly ".."
// Every use of the argument is parenthesised so that expressions such as
// `cond ? a : b` expand with the intended precedence. The argument is
// evaluated more than once, so it must not have side effects.
#define DOTP(x) ((*(x) == '.') && (!*((x) + 1)))

#define DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') && (!*((x) + 2)))

 

// Host name -> dotted IP string cache; read and filled by CUrl::GetIpByHost.
std::map<std::string, std::string> mapCacheHostLookup;
// Defined in another translation unit; not referenced in this file's visible code.
extern std::vector<std::string> vsUnreachHost;
// Guards insertions into mapCacheHostLookup.
pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER;
// Digest strings looked up by CUrl::IsVisitedUrl.
extern std::set<std::string> setVisitedUrlMD5;
// Consulted by CUrl::IsValidIp: an address matches an entry when
// (address & ~entry.second) == entry.first.
extern std::map<unsigned long, unsigned long> mapIpBlock;
// Element type used when inserting into mapCacheHostLookup.
typedef std::map<std::string, std::string>::value_type valTypeCHL;

 

// Describes one URL scheme known to the parser.
struct scheme_data
{
    // Scheme prefix such as "http://". Declared const because the table below
    // initialises it from string literals; a NULL value marks the end of the table.
    const char *leading_string;
    int default_port; // default port for the scheme
    int enabled;      // non-zero when the scheme may be used
};

 

//所有连接情况的定义
static struct scheme_data supported_schemes[] =
{
 { "http://", DEFAULT_HTTP_PORT,  1 },
 { "ftp://",  DEFAULT_FTP_PORT,   1},

 
 {NULL,      -1,                0 }
};


//分析并填充连接类型

void CUrl::ParseScheme (const char *url)
{
 int i;

 for (i = 0;supported_schemes[i].leading_string; i++)

  if (0 ==strncasecmp (url, supported_schemes[i].leading_string,
                         strlen (supported_schemes[i].leading_string))) {

    //判断连接类型,并更新m_eScheme成员

  if (supported_schemes[i].enabled){
    this->m_eScheme= (enum url_scheme) i;
    return;
   }else{
    this->m_eScheme= SCHEME_INVALID;
    return;
   }
  }

 this->m_eScheme= SCHEME_INVALID;
 return;
}


// Parses `strUrl` and stores its pieces in this object.
//
// Returns false (leaving m_sUrl/m_sHost/m_sPath/m_nPort untouched) when the
// scheme detected by ParseScheme is not SCHEME_HTTP. Otherwise fills m_sUrl,
// m_sHost and m_sPath, updates m_nPort when the parsed port is positive, and
// returns true. Host and path are limited by the fixed-size scratch buffers
// below; longer parts are truncated.
bool CUrl::ParseUrlEx(string strUrl)
{
    char protocol[10];
    char host[HOST_LEN];
    char request[256];
    int port = -1;

    // Zero the scratch buffers so they start out as empty strings.
    memset( protocol, 0, sizeof(protocol) );
    memset( host, 0, sizeof(host) );
    memset( request, 0, sizeof(request) );

    // Classify the scheme; only HTTP URLs are processed further.
    this->ParseScheme(strUrl.c_str());
    if( this->m_eScheme != SCHEME_HTTP ){
        return false;
    }

    // Hand the low-level parser one byte less than each buffer really holds:
    // the last byte then always keeps the zero written above, so the buffers
    // stay NUL-terminated even when a component is longer than its buffer.
    ParseUrlEx(strUrl.c_str(),
        protocol, static_cast<int>(sizeof(protocol)) - 1,
        host, static_cast<int>(sizeof(host)) - 1,
        request, static_cast<int>(sizeof(request)) - 1,
        &port);

    // Publish the parsed pieces.
    m_sUrl  = strUrl;
    m_sHost = host;
    m_sPath = request;

    // A non-positive port (e.g. ":0" or an unparsable port) keeps the old m_nPort.
    if( port > 0 ){
        m_nPort = port;
    }

    return true;
}


//对URL进一步处理
 

void CUrl::ParseUrlEx(const char *url,
  char *protocol, intlprotocol,
  char *host, int lhost,
  char *request, intlrequest,
  int *port)
{
 char *work,*ptr,*ptr2;

 *protocol = *host = *request =0;
 *port = 80;

 //准备一个临时缓冲区WORK

 int len = strlen(url);
 //pthread_mutex_lock(&mutexMemory);
 work = new char[len + 1];
 //pthread_mutex_unlock(&mutexMemory);
 memset(work, 0, len+1);
 strncpy(work, url, len);

 //依据":"找到协议,如果URL内有协议类型,则写入protocol,否则默认为HTTP

 // find protocol if any
 ptr = strchr(work, ':');
 if( ptr != NULL ){
  *(ptr++) = 0;//以免lprotocol比work中的协议字符的长度还要长
  strncpy( protocol, work,lprotocol );
 } else

{//有些URL省略了HTTP的前缀
  strncpy( protocol, "HTTP",lprotocol );
  ptr = work;
 }

//跳过"//"

 // skip past opening /'s
 if( (*ptr=='/')&& (*(ptr+1)=='/') )
  ptr+=2;

//使用ptr2处理主机

 // find host
 ptr2 = ptr;
 while( IsValidHostChar(*ptr2)&& *ptr2 )
  ptr2++;
 *ptr2 = 0;//保证复制主机字符的正确性和安全性
 strncpy( host, ptr, lhost );

 

 //处理请求部分的字符

 // find the request
 int offset = ptr2 - work;
 const char *pStr = url + offset;

//PSTR指向请求部分的起始部分通过offset偏移地址
 strncpy( request, pStr, lrequest );

 

//处理端口号

 // find the port number, ifany
 ptr = strchr( host, ':' );
 if( ptr != NULL ){
  *ptr = 0;
  *port = atoi(ptr+1);
 }

 //pthread_mutex_lock(&mutexMemory);
 delete [] work;
 //pthread_mutex_unlock(&mutexMemory);
 work = NULL;
}


// Default constructor: starts with an invalid scheme, the default HTTP port
// and empty URL, host and path strings.
CUrl::CUrl()
{
    m_eScheme = SCHEME_INVALID;
    m_nPort = DEFAULT_HTTP_PORT;

    m_sUrl = "";
    m_sHost = "";
    m_sPath = "";
}

// Destructor: performs no explicit cleanup.
CUrl::~CUrl() {}


//通过主机字符得到IP地址
char * CUrl::GetIpByHost(const char *host)
{
 if( !host ){ // nullpointer
  return NULL;
 }

 if( !IsValidHost(host) ){ //invalid host
  return NULL;
 }

 unsigned long inaddr = 0;
 char *result = NULL;
 int len = 0;

 //主机转32位二进制数的IP地址
 inaddr = (unsigned long)inet_addr( host );
 //if ( (int)inaddr != -1){
 if ( inaddr != INADDR_NONE){ // host is justip

 //===主机名如果为XX.XX.XX.XX形式的IP地址字符,则inet_addr执行成功
  len = strlen(host);
  //pthread_mutex_lock(&mutexMemory);
  result = new char[len+1];
  //pthread_mutex_unlock(&mutexMemory);
  memset(result, 0, len+1);
  memcpy(result, host, len);

  return result;

       }

else

{
  //firt find from cache
  //==主机名字符为非IP形式

 //Cache暂存的内存以主机名为索引查找IP地址
  map<string,string>::iteratorit  = mapCacheHostLookup.find(host);

  if( it !=mapCacheHostLookup.end() ){ // find in host lookupcache
   const char *strHostIp;

   strHostIp= (*it).second.c_str();

   inaddr =(unsigned long)inet_addr( strHostIp );
   //if ((int)inaddr != -1){
   if ( inaddr!= INADDR_NONE ){

 //成功在Cache中找到主机名对应的IP地址
    len= strlen(strHostIp);
    //pthread_mutex_lock(&mutexMemory);
    result= new char[len+1];
    //pthread_mutex_unlock(&mutexMemory);
    memset(result, 0, len+1 );
    memcpy(result, strHostIp, len );

    //cout<< ":)" ;
    
    returnresult;
         }
  }
 }

//均未找到,只能求助于DNS服务

 // if still not find, then try by DNSserver
 struct hostent *hp; 
 hp =gethostbyname(host);//通过主机名获得IP地址
 if(hp == NULL) {
  //cout<< "gethostbyname() error inGetIpByHost: " << host<< endl;
  return NULL;
 }

 // cache host lookup

//in为32位的IP地址结构变量
       struct  in_addr in;

 bcopy(*(hp->h_addr_list),(caddr_t)&in, hp->h_length);
 //inet_ntoa()是对structin_addr*结构转变成可以答应的ip的点进字

//符串,inet_ntop也是同样的功能,不过inet_ntop是使用于ipv4,ipv6

//而inet_ntoa只能用于ipv4


 //AF_INET表示为IP地址形式即INTERNET地址家族

char   abuf[INET_ADDRSTRLEN];

//将IN中的地址转成带点的IP地址字符形式
       if( inet_ntop(AF_INET, (void *)&in,abuf,sizeof(abuf)) == NULL ){
  cout<< "inet_ntop() return error inGetIpByHost" << endl;
  return NULL;

 } else {

//可以成功转化成字符形式的IP写入abuf

  pthread_mutex_lock(&mutexCacheHost);
  //if(mapCacheHostLookup.count(host) == 0){
//更新CACHE中的主机和IP地址对  

if( mapCacheHostLookup.find(host) == mapCacheHostLookup.end()){

  
   //cout<< endl<< host<< " and "<< abuf<< endl;
   mapCacheHostLookup.insert(valTypeCHL ( host, abuf));
  }
  pthread_mutex_unlock(&mutexCacheHost);

 }

 // return result
 len = strlen(abuf);
 //pthread_mutex_lock(&mutexMemory);
 result = new char[len + 1];
 //pthread_mutex_unlock(&mutexMemory);
 memset( result, 0, len+1 );
 memcpy( result, abuf, len );

 return result;
}


// Returns true when `ch` may appear in the host part of a URL: a letter, a
// digit, or one of '-', '.', ':' and '_'. (':' is accepted so that a
// "host:port" run is scanned as one unit by ParseUrlEx.)
bool CUrl::IsValidHostChar(char ch)
{
    // The <cctype> classifiers require a value representable as unsigned char;
    // passing a negative plain char (bytes >= 0x80) is undefined behaviour.
    const unsigned char uch = static_cast<unsigned char>(ch);

    return( isalnum(uch)
        || ch=='-' || ch=='.' || ch==':' || ch=='_');
}


bool CUrl::IsValidHost(const char *host)
{
 if( !host ){
  return false;
 }

 if( strlen(host) < 6 ){ // incase host like "www", "pku", etc.
  return false;
 }

 char ch;
 for(unsigned int i=0;i<strlen(host); i++){
  ch = *(host++);
  if( !IsValidHostChar(ch)){
   returnfalse;
  }
 }

 return true;
}


bool CUrl::IsVisitedUrl(const char *url)
{
 if( !url ){
  return true; // if be null, wethink it have been visited
 }

 CMD5 iMD5;
 iMD5.GenerateMD5( (unsigned char*)url,strlen(url) );
 string strDigest = iMD5.ToString();

 if( setVisitedUrlMD5.find(strDigest) !=setVisitedUrlMD5.end() ) {
  return true;
 } else {
  return false;
 }

}



bool CUrl::IsValidIp(const char *ip)
{
 if( ip == NULL ){
  return false;
 }

 unsigned long inaddr = (unsignedlong)inet_addr(ip);
 if( inaddr == INADDR_NONE ){ //invalid ip
  return false;
 }

 if( mapIpBlock.size() > 0){
  map<unsignedlong,unsigned long>::iterator pos;
  for(pos=mapIpBlock.begin();pos!=mapIpBlock.end(); ++pos){
   unsigned longret;

   ret =inaddr & ~((*pos).second);
   if( ret ==(*pos).first ){ // inside
    returntrue;
   }
  }

  // outside
  return false;
 }

 // if block range is not given, we think itinside also
 return true;
}


// Classifies `host` as foreign (true) or home (false).
//
// Empty hosts and hosts longer than HOST_LEN are reported as foreign. A host
// that inet_addr accepts as an address literal is reported as home. Otherwise
// the text after the last '.' is lower-cased and compared with the list
// below: a match means home, anything else (including a host without any
// '.') means foreign.
bool CUrl::IsForeignHost(string host)
{
    if( host.empty() ) return true;
    if( host.size() > HOST_LEN ) return true;

    if( inet_addr(host.c_str()) != INADDR_NONE ){ // host is just ip
        return false;
    }

    // Extract the last label; it stays empty when there is no '.'.
    string tmp;
    const string::size_type idx = host.rfind('.');
    if( idx != string::npos ){
        tmp = host.substr(idx+1);
    }

    CStrFun::Str2Lower( tmp, tmp.size() );

    static const char *const home_host[] = {
        "cn","com","net","org","info",
        "biz","tv","cc", "hk","tw"
    };
    // Derived from the array so the count cannot drift from the list.
    const size_t home_host_num = sizeof(home_host) / sizeof(home_host[0]);

    for( size_t i = 0; i < home_host_num; i++ ){
        if( tmp == home_host[i] )
            return false;
    }

    return true;
}
 
 
// Reports whether `url` ends in a known image file extension.
//
// Returns false for an empty URL and for a URL longer than HOST_LEN.
// Otherwise the text after the last '.' is lower-cased and compared with the
// list below. Anything after the extension (for instance a query string)
// prevents a match.
// NOTE(review): the length limit reuses HOST_LEN, a host-name constant --
// confirm this is the intended bound for whole URLs.
bool CUrl::IsImageUrl(string url)
{
    if( url.empty() ) return false;
    if( url.size() > HOST_LEN ) return false;

    // Extract the extension; it stays empty when there is no '.'.
    string tmp;
    const string::size_type idx = url.rfind('.');
    if( idx != string::npos ){
        tmp = url.substr(idx+1);
    }

    CStrFun::Str2Lower( tmp, tmp.size() );

    static const char *const image_type[] = {
        "gif","jpg","jpeg","png","bmp",
        "tif","psd"
    };
    // Derived from the array so the count cannot drift from the list.
    const size_t image_type_num = sizeof(image_type) / sizeof(image_type[0]);

    for( size_t i = 0; i < image_type_num; i++ ){
        if( tmp == image_type[i] )
            return true;
    }

    return false;
}
 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值