天网搜索TSE部分源码分析-url.cpp

本文介绍了一个C++类CUrl的设计与实现,该类用于解析和处理URL,包括获取URL的各个组成部分如协议、主机名、路径等,并提供了一系列辅助函数如验证主机名有效性、获取IP地址等。

//根据一个给定的URL,组成消息体,发送给该URL指向的服务器。

//为此,定义Url类
//url.cpp

#include<iostream>
#include <string>
#include <sys/socket.h>
#include <netdb.h>

#include "Tse.h"
#include "Url.h"
#include "Http.h"
#include "Md5.h"
#include "StrFun.h"

// Helpers for inspecting "." path separators.
//   DOTP(x)  - true when the C string x is exactly "."  ('.' followed by NUL).
//   DDOTP(x) - true when the C string x is exactly ".." ("..", then NUL).
// The argument is parenthesized at every use so that expression arguments
// (e.g. `cond ? a : b`) are offset as a whole instead of being re-associated
// with the `+ 1` / `+ 2`. Note that x is evaluated more than once.
#define DOTP(x) ((*(x) == '.') && (!*((x) + 1)))

#define DDOTP(x) ((*(x) == '.') && (*((x) + 1) == '.') && (!*((x) + 2)))

// Host name -> dotted IP string cache filled by CUrl::GetIpByHost().
std::map<std::string, std::string> mapCacheHostLookup;
// Defined in another translation unit.
extern std::vector<std::string> vsUnreachHost;
// Guards mapCacheHostLookup.
pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER;
// MD5 digests of URLs, consulted by CUrl::IsVisitedUrl(); defined elsewhere.
extern std::set<std::string> setVisitedUrlMD5;
// IP block table consulted by CUrl::IsValidIp(); defined elsewhere.
extern std::map<unsigned long, unsigned long> mapIpBlock;
// Value type used when inserting into mapCacheHostLookup.
typedef std::map<std::string, std::string>::value_type valTypeCHL;

// Describes one URL scheme the crawler knows about.

struct scheme_data
{
    // Scheme prefix including the "://" part, e.g. "http://". It is
    // initialized from string literals, so it must point to const data.
    const char *leading_string;
    int default_port;   // port used when the URL does not name one
    int enabled;        // non-zero when URLs of this scheme are accepted
};

// Table of all URL schemes recognized by this file. The list is terminated
// by an entry whose leading_string is NULL. CUrl::ParseScheme() casts the
// index of the matching row to enum url_scheme, so the row order presumably
// mirrors that enum -- TODO confirm against Url.h.
static struct scheme_data supported_schemes[] =
{
// prefix, default port, enabled flag
{ "http://",DEFAULT_HTTP_PORT, 1 },
{ "ftp://",DEFAULT_FTP_PORT, 1},


// sentinel: marks the end of the table
{NULL,-1,0 }
};


//分析并填充连接类型

void CUrl::ParseScheme (const char *url)
{
int i;

for (i = 0;supported_schemes[i].leading_string; i++)

if (0 ==strncasecmp (url, supported_schemes[i].leading_string,
strlen (supported_schemes[i].leading_string))) {

 //判断连接类型,并更新m_eScheme成员

  if (supported_schemes[i].enabled){
this->m_eScheme= (enum url_scheme) i;
return;
}else{
this->m_eScheme= SCHEME_INVALID;
return;
}
}

this->m_eScheme= SCHEME_INVALID;
return;
}


// Parse strUrl and, when it is an HTTP URL, store its pieces in the object.
//
// On success m_sUrl receives the whole URL, m_sHost the host part, m_sPath the
// request part, and m_nPort the port reported by the low-level parser (only
// when that port is positive).
//
// Returns false, leaving those members untouched, when ParseScheme() does not
// classify the URL as SCHEME_HTTP; returns true otherwise.
bool CUrl::ParseUrlEx(string strUrl)
{
    char protocol[10];
    char host[HOST_LEN];
    char request[256];
    int port = -1;

    // Zero the scratch buffers; the final byte of each is relied on below.
    memset( protocol, 0, sizeof(protocol) );
    memset( host, 0, sizeof(host) );
    memset( request, 0, sizeof(request) );

    // Classify the scheme; only HTTP URLs are processed further.
    this->ParseScheme(strUrl.c_str());

    if( this->m_eScheme != SCHEME_HTTP ){
        return false;
    }

    // The low-level parser copies with strncpy(), which does not terminate the
    // destination when the source fills it. Advertising one byte less than the
    // real size keeps the zeroed last byte as a terminator, so an over-long
    // host or path is truncated instead of being read past the buffer.
    ParseUrlEx(strUrl.c_str(),
               protocol, sizeof(protocol) - 1,
               host, sizeof(host) - 1,
               request, sizeof(request) - 1,
               &port);

    // Publish the parsed pieces.
    m_sUrl = strUrl;
    m_sHost = host;
    m_sPath = request;

    if( port > 0 ){
        m_nPort = port;
    }

    return true;
}


// Copy at most dstLen-1 characters of src into dst and always NUL-terminate.
// Does nothing when dstLen is not positive.
static void UrlCopyBounded(char *dst, int dstLen, const char *src)
{
    if( dstLen <= 0 ){
        return;
    }
    strncpy( dst, src, dstLen - 1 );
    dst[dstLen - 1] = '\0';
}

// Split `url` into protocol, host, request and port.
//
//   url       NUL-terminated URL text.
//   protocol  receives the text before the first ':'; "HTTP" when the URL
//             contains no ':' at all. Capacity is lprotocol bytes.
//   host      receives the run of valid host characters that follows the
//             optional "//"; a ":port" suffix is cut off. Capacity lhost.
//   request   receives everything in `url` after the host run. Capacity
//             lrequest.
//   port      receives the number after ':' in the host part, or 80 when the
//             host carries no port.
//
// Every output string is truncated to fit its buffer and is always
// NUL-terminated.
//
// NOTE(review): a scheme-less URL that names a port ("host:8080/x") is split
// at that ':' and the host is taken as the protocol. The in-file caller only
// gets here after ParseScheme() accepted an "http://" prefix.
void CUrl::ParseUrlEx(const char *url,
                      char *protocol, int lprotocol,
                      char *host, int lhost,
                      char *request, int lrequest,
                      int *port)
{
    *protocol = *host = *request = 0;
    *port = 80;

    // Mutable scratch copy of the URL; released automatically on return.
    std::string scratch(url);
    char *work = &scratch[0];
    char *ptr = strchr(work, ':');

    // Protocol: text before the first ':' when there is one.
    if( ptr != NULL ){
        *(ptr++) = 0;   // cut the scratch copy so only the protocol is copied
        UrlCopyBounded( protocol, lprotocol, work );
    } else {
        // Some URLs omit the "http" prefix.
        UrlCopyBounded( protocol, lprotocol, "HTTP" );
        ptr = work;
    }

    // Skip the "//" that follows the protocol.
    if( (*ptr == '/') && (*(ptr + 1) == '/') ){
        ptr += 2;
    }

    // Host: the longest run of valid host characters (this includes ':').
    char *ptr2 = ptr;
    while( *ptr2 && IsValidHostChar(*ptr2) ){
        ptr2++;
    }
    *ptr2 = 0;   // end the host inside the scratch copy
    UrlCopyBounded( host, lhost, ptr );

    // Request: taken from the untouched original at the same offset, because
    // the scratch copy has been cut up above.
    int offset = ptr2 - work;
    UrlCopyBounded( request, lrequest, url + offset );

    // Port: split an optional ":port" off the copied host.
    ptr = strchr( host, ':' );
    if( ptr != NULL ){
        *ptr = 0;
        *port = atoi(ptr + 1);
    }
}


// Default constructor: the object starts with empty URL, host and path
// strings, an invalid scheme and the default HTTP port.
CUrl::CUrl()
{
    m_eScheme = SCHEME_INVALID;
    m_nPort = DEFAULT_HTTP_PORT;

    m_sUrl = "";
    m_sHost = "";
    m_sPath = "";
}

// Destructor: intentionally empty, no work is done here.
CUrl::~CUrl()
{
}


//通过主机字符得到IP地址
char * CUrl::GetIpByHost(const char *host)
{
if( !host ){// nullpointer
return NULL;
}

if( !IsValidHost(host) ){//invalid host
return NULL;
}

unsigned long inaddr = 0;
char *result = NULL;
int len = 0;

//主机转32位二进制数的IP地址
inaddr = (unsigned long)inet_addr( host );
//if ( (int)inaddr != -1){
if ( inaddr != INADDR_NONE){ // host is justip

//===主机名如果为XX.XX.XX.XX形式的IP地址字符,则inet_addr执行成功
len = strlen(host);
//pthread_mutex_lock(&mutexMemory);
result = new char[len+1];
//pthread_mutex_unlock(&mutexMemory);
memset(result, 0, len+1);
memcpy(result, host, len);

return result;

}

else

{
//firt find from cache
//==主机名字符为非IP形式

//Cache暂存的内存以主机名为索引查找IP地址
map<string,string>::iteratorit = mapCacheHostLookup.find(host);

if( it !=mapCacheHostLookup.end() ){// find in host lookupcache
const char *strHostIp;

strHostIp= (*it).second.c_str();

inaddr =(unsigned long)inet_addr( strHostIp );
//if ((int)inaddr != -1){
if ( inaddr!= INADDR_NONE ){

//成功在Cache中找到主机名对应的IP地址
len= strlen(strHostIp);
//pthread_mutex_lock(&mutexMemory);
result= new char[len+1];
//pthread_mutex_unlock(&mutexMemory);
memset(result, 0, len+1 );
memcpy(result, strHostIp, len );

//cout<< ":)" ;

returnresult;
}
}
}

//均未找到,只能求帮助于DNS服务

// if still not find, then try by DNSserver
struct hostent *hp;
hp =gethostbyname(host);//通过主机名获得IP地址
if(hp == NULL) {
//cout<< "gethostbyname() error inGetIpByHost: " << host<< endl;
return NULL;
}

// cache host lookup

//in为32位的IP地址结构变量
struct in_addr in;

bcopy(*(hp->h_addr_list),(caddr_t)&in, hp->h_length);
//inet_ntoa()是对structin_addr*结构转变成可以答应的ip的点进字

//符串,inet_ntop也是同样的功能,不过inet_ntop是使用于ipv4,ipv6

//而inet_ntoa只能用于ipv4


//AF_INET表示为IP地址形式即INTERNET地址家族

charabuf[INET_ADDRSTRLEN];

//将IN中的地址转成带点的IP地址字符形式
if( inet_ntop(AF_INET, (void *)&in,abuf,sizeof(abuf)) == NULL ){
cout<< "inet_ntop() return error inGetIpByHost" << endl;
return NULL;

} else {

//可以成功转化成字符形式的IP写入abuf

pthread_mutex_lock(&mutexCacheHost);
//if(mapCacheHostLookup.count(host) == 0){
//更新CACHE中的主机和IP地址对

if( mapCacheHostLookup.find(host) == mapCacheHostLookup.end()){


//cout<< endl<< host<< " and "<< abuf<< endl;
mapCacheHostLookup.insert(valTypeCHL ( host, abuf));
}
pthread_mutex_unlock(&mutexCacheHost);

}

// return result
len = strlen(abuf);
//pthread_mutex_lock(&mutexMemory);
result = new char[len + 1];
//pthread_mutex_unlock(&mutexMemory);
memset( result, 0, len+1 );
memcpy( result, abuf, len );

return result;
}


// Return true when ch may appear in the host part of a URL: an ASCII letter
// or digit, or one of '-', '.', ':' and '_'.
bool CUrl::IsValidHostChar(char ch)
{
    // The <cctype> classifiers require a value representable as unsigned char
    // (or EOF). A plain char can be negative for bytes >= 0x80, so convert
    // first to avoid undefined behaviour.
    const unsigned char uch = static_cast<unsigned char>(ch);

    return( isalpha(uch) || isdigit(uch)
        || ch=='-' || ch=='.' || ch==':' || ch=='_' );
}


// Return true when `host` is non-NULL, at least 6 characters long, and made
// up only of characters accepted by IsValidHostChar().
bool CUrl::IsValidHost(const char *host)
{
    if( !host ){
        return false;
    }

    if( strlen(host) < 6 ){ // in case host like "www", "pku", etc.
        return false;
    }

    // Walk the whole string. The earlier loop advanced `host` while comparing
    // an increasing index against strlen(host), so it stopped half-way and
    // never looked at the second half of the name.
    for( const char *p = host; *p; ++p ){
        if( !IsValidHostChar(*p) ){
            return false;
        }
    }

    return true;
}


bool CUrl::IsVisitedUrl(const char *url)
{
if( !url ){
return true; // if be null, wethink it have been visited
}

CMD5 iMD5;
iMD5.GenerateMD5( (unsigned char*)url,strlen(url) );
string strDigest = iMD5.ToString();

if( setVisitedUrlMD5.find(strDigest) !=setVisitedUrlMD5.end() ) {
return true;
} else {
return false;
}

}



// Decide whether the dotted IP string `ip` may be crawled.
//
// Returns false when ip is NULL or inet_addr() rejects it. When mapIpBlock is
// empty every parseable address is accepted. Otherwise the address is
// accepted only if some entry (first, second) of mapIpBlock satisfies
// (address & ~second) == first.
// NOTE(review): presumably `first` is a network address and `second` the
// inverted netmask -- confirm against the code that fills mapIpBlock.
bool CUrl::IsValidIp(const char *ip)
{
    if( ip == NULL ){
        return false;
    }

    unsigned long inaddr = (unsigned long)inet_addr(ip);
    if( inaddr == INADDR_NONE ){    // invalid ip
        return false;
    }

    // No block ranges configured: every address counts as inside.
    if( mapIpBlock.empty() ){
        return true;
    }

    map<unsigned long,unsigned long>::iterator pos;
    for( pos = mapIpBlock.begin(); pos != mapIpBlock.end(); ++pos ){
        unsigned long ret = inaddr & ~(pos->second);
        if( ret == pos->first ){    // inside
            return true;
        }
    }

    // outside every configured range
    return false;
}


// Classify `host` as foreign (true) or home (false).
//
// Foreign: an empty host, a host longer than HOST_LEN, or a host whose text
// after the last '.' is not in the home list below (this includes hosts with
// no '.' at all). Home: a dotted IP address, or a host ending in one of the
// listed suffixes.
bool CUrl::IsForeignHost(string host)
{
    if( host.empty() ) return true;
    if( host.size() > HOST_LEN ) return true;

    // A host given as a plain IP address is never treated as foreign.
    if( inet_addr( host.c_str() ) != INADDR_NONE ){
        return false;
    }

    // Take the text after the last '.'; it stays empty when there is no '.'.
    string tmp;
    string::size_type idx = host.rfind('.');
    if( idx != string::npos ){
        tmp = host.substr(idx+1);
    }

    // Presumably lower-cases tmp in place -- see CStrFun::Str2Lower.
    CStrFun::Str2Lower( tmp, tmp.size() );

    const char *home_host[] = {
        "cn","com","net","org","info",
        "biz","tv","cc", "hk","tw"
    };
    // Derive the count from the array so the two cannot drift apart.
    const size_t home_host_num = sizeof(home_host) / sizeof(home_host[0]);

    for( size_t i = 0; i < home_host_num; i++ ){
        if( tmp == home_host[i] )
            return false;
    }

    return true;
}


// Return true when the text after the last '.' in `url` is one of the image
// extensions listed below. Empty URLs, URLs longer than HOST_LEN and URLs
// without a '.' are reported as non-image.
bool CUrl::IsImageUrl(string url)
{
    if( url.empty() ) return false;
    if( url.size() > HOST_LEN ) return false;

    // Take the text after the last '.'; it stays empty when there is no '.'.
    string tmp;
    string::size_type idx = url.rfind('.');
    if( idx != string::npos ){
        tmp = url.substr(idx+1);
    }

    // Presumably lower-cases tmp in place -- see CStrFun::Str2Lower.
    CStrFun::Str2Lower( tmp, tmp.size() );

    const char *image_type[] = {
        "gif","jpg","jpeg","png","bmp",
        "tif","psd"
    };
    // Derive the count from the array so the two cannot drift apart.
    const size_t image_type_num = sizeof(image_type) / sizeof(image_type[0]);

    for( size_t i = 0; i < image_type_num; i++ ){
        if( tmp == image_type[i] )
            return true;
    }

    return false;
}

TSE(Tiny Search Engine) ======================= (Temporary) Web home: http://162.105.80.44/~yhf/Realcourse/ TSE is a free utility for non-interactive download of files from the Web. It supports HTTP. According to query word or url, it retrieves results from crawled pages. It can follow links in HTML pages and create output files in Tianwang (http://e.pku.edu.cn/) format or ISAM format files. Additionally, it provides link structures which can be used to rebuild the web frame. --------------------------- Main functions in the TSE: 1) normal crawling, named SE, e.g: crawling all pages in PKU scope. and retrieve results from crawled pages according to query word or url, 2) crawling images and corresponding pages, named ImgSE. --------------------------- INSTALL: 1) execute "tar xvfz tse.XXX.gz" --------------------------- Before running the program, note Note: The program is default for normal crawling (SE). For ImgSE, you should: 1. change codes with the following requirements, 1) In "Page.cpp" file, find two same functions "CPage::IsFilterLink(string plink)" One is for ImgSE whose urls must include "tupian", "photo", "ttjstk", etc. the other is for normal crawling. For ImgSE, remember to comment the paragraph and choose right "CPage::IsFilterLink(string plink)". For SE, remember to open the paragraph and choose right "CPage::IsFilterLink(string plink)". 2) In Http.cpp file i. find "if( iPage.m_sContentType.find("image") != string::npos )" Comment the right paragraph. 3) In Crawl.cpp file, i. "if( iPage.m_sContentType != "text/html" Comment the right paragraph. ii. find "if(file_length < 40)" Choose right one line. iii. find "iMD5.GenerateMD5( (unsigned char*)iPage.m_sContent.c_str(), iPage.m_sContent.length() )" Comment the right paragraph. iv. find "if (iUrl.IsImageUrl(strUrl))" Comment the right paragraph. 2.sh Clean; (Note not remove link4History.url, you should comment "rm -f link4History.url" line first) secondly use "link4History.url" as a seed file. 
"link4History" is produced while normal crawling (SE). --------------------------- EXECUTION: execute "make clean; sh Clean;make". 1) for normal crawling and retrieving ./Tse -c tse_seed.img According to query word or url, retrieve results from crawled pages ./Tse -s 2) for ImgSE ./Tse -c tse_seed.img After moving Tianwang.raw.* data to secure place, execute ./Tse -c link4History.url --------------------------- Detail functions: 1) suporting multithreads crawling pages 2) persistent HTTP connection 3) DNS cache 4) IP block 5) filter unreachable hosts 6) parsing hyperlinks from crawled pages 7) recursively crawling pages h) Outputing Tianwang format or ISAM format files --------------------------- Files in the package Tse --- Tse execute file tse_unreachHost.list --- unreachable hosts according to PKU IP block tse_seed.pku --- PKU seeds tse_ipblock --- PKU IP block ... Directories in the package hlink,include,lib,stack,uri directories --- Parse links from a page --------------------------- Please report bugs in TSE to MAINTAINERS: YAN Hongfei * Created: YAN Hongfei, Network lab of Peking University. * Created: July 15 2003. version 0.1.1 * # Can crawl web pages with a process * Updated: Aug 20 2003. version 1.0.0 !!!! * # Can crawl web pages with multithreads * Updated: Nov 08 2003. version 1.0.1 * # more classes in the codes * Updated: Nov 16 2003. version 1.1.0 * # integrate a new version linkparser provided by XIE Han * # according to all MD5 values of pages content, * for all the pages not seen before, store a new page * Updated: Nov 21 2003. version 1.1.1 * # record all duplicate urls in terms of content MD5
<div class="product-item" data-category="novel"> <img src="https://ts1.tc.mm.bing.net/th/id/R-C.115539104325e27be2f0ee05ca4c080b?rik=xSg%2bwFfcC1%2b2QQ&riu=http%3a%2f%2fimages.bookuu.com%2fbook%2fC%2f01638%2f2634742-fm.jpg&ehk=H9B6G0WCUFgsQjHJE1iAtNWwpFnhmdAg30ByUapdJfY%3d&risl=&pid=ImgRaw&r=0" alt="产品2"> <h3>产品2:西游记</h3> <p>价格:$75</p> <button onclick="addToCart('产品2:西游记', 75, 'https://ts1.tc.mm.bing.net/th/id/R-C.115539104325e27be2f0ee05ca4c080b?rik=xSg%2bwFfcC1%2b2QQ&riu=http%3a%2f%2fimages.bookuu.com%2fbook%2fC%2f01638%2f2634742-fm.jpg&ehk=H9B6G0WCUFgsQjHJE1iAtNWwpFnhmdAg30ByUapdJfY%3d&risl=&pid=ImgRaw&r=0')">添加到购物车</button> </div> <div class="product-item" data-category="novel"> <img src="https://tse3-mm.cn.bing.net/th/id/OIP-C.3JrruqZy-PznYJxAHsoUdAHaKe?rs=1&pid=ImgDetMain" alt="产品3"> <h3>产品3:水浒传</h3> <p>价格:$100</p> <button onclick="addToCart('产品3:水浒传', 100, 'https://tse3-mm.cn.bing.net/th/id/OIP-C.3JrruqZy-PznYJxAHsoUdAHaKe?rs=1&pid=ImgDetMain')">添加到购物车</button> </div> <div class="product-item" data-category="novel"> <img src="https://ts1.tc.mm.bing.net/th/id/R-C.741a4e06ace8ed3a2a47d2a7fc53efa6?rik=OTex%2bIz3NhB3vw&riu=http%3a%2f%2fsdwypress.com%2fuploads%2fimage%2f2106%2f24%2f210624143125esxhc.jpg&ehk=bJB%2f6p6aW7wkHHbCiUTcnRf%2bNLwOktGxYScy28rgjwg%3d&risl=&pid=ImgRaw&r=0" alt="产品4"> <h3>产品4:三国演义</h3> <p>价格:$55</p> <button onclick="addToCart('产品4:三国演义', 55, 'https://ts1.tc.mm.bing.net/th/id/R-C.741a4e06ace8ed3a2a47d2a7fc53efa6?rik=OTex%2bIz3NhB3vw&riu=http%3a%2f%2fsdwypress.com%2fuploads%2fimage%2f2106%2f24%2f210624143125esxhc.jpg&ehk=bJB%2f6p6aW7wkHHbCiUTcnRf%2bNLwOktGxYScy28rgjwg%3d&risl=&pid=ImgRaw&r=0')">添加到购物车</button> </div> <div class="product-item" data-category="novel"> <img src="https://tse1-mm.cn.bing.net/th/id/OIP-C.4tBovpeIT27lEOj0FuFT5QHaKu?rs=1&pid=ImgDetMain" alt="产品5"> <h3>产品5:红岩</h3> <p>价格:$65</p> <button onclick="addToCart('产品5:红岩', 65, 
'https://tse1-mm.cn.bing.net/th/id/OIP-C.4tBovpeIT27lEOj0FuFT5QHaKu?rs=1&pid=ImgDetMain')">添加到购物车</button> </div> <div class="product-item" data-category="history"> <img src="https://tse4-mm.cn.bing.net/th/id/OIP-C.PgSOBNz_XC3Q2b_O9Nkh-wHaLS?rs=1&pid=ImgDetMain" alt="产品6"> <h3>产品6:开放中的变迁</h3> <p>价格:$50</p> <button onclick="addToCart('产品6:开放中的变迁', 50, 'https://tse4-mm.cn.bing.net/th/id/OIP-C.PgSOBNz_XC3Q2b_O9Nkh-wHaLS?rs=1&pid=ImgDetMain')">添加到购物车</button> </div> <div class="product-item" data-category="history"> <img src="https://tse2-mm.cn.bing.net/th/id/OIP-C.1O2NDFn3NcQ5UtfKVUX9qwHaLD?rs=1&pid=ImgDetMain" alt="产品7"> <h3>产品7:中国近代史</h3> <p>价格:$55</p> <button onclick="addToCart('产品7:中国近代史', 55, 'https://tse2-mm.cn.bing.net/th/id/OIP-C.1O2NDFn3NcQ5UtfKVUX9qwHaLD?rs=1&pid=ImgDetMain')">添加到购物车</button> </div> <div class="product-item" data-category="history"> <img src="https://img3.doubanio.com/lpic/s27258223.jpg" alt="产品8"> <h3>产品8:国史大纲</h3> <p>价格:$65</p> <button onclick="addToCart('产品8:国史大纲', 65, 'https://img3.doubanio.com/lpic/s27258223.jpg')">添加到购物车</button> </div> <div class="product-item" data-category="science"> <img src="https://so1.360tres.com/t0151ae7fb7ec6d8870.jpg" alt="产品9"> <h3>产品9:时间简史</h3> <p>价格:$65</p> <button onclick="addToCart('产品9:时间简史', 65, 'https://so1.360tres.com/t0151ae7fb7ec6d8870.jpg')">添加到购物车</button> </div> <div class="product-item" data-category="science"> <img src="https://img.alicdn.com/bao/uploaded/i1/3715517208/O1CN01yttyGV237JTlFwGhn_!!0-item_pic.jpg" alt="产品10"> <h3>产品10:从一到无穷大:科学中的事实和臆测</h3> <p>价格:$65</p> <button onclick="addToCart('产品10:从一到无穷大:科学中的事实和臆测', 65, 'https://img.alicdn.com/bao/uploaded/i1/3715517208/O1CN01yttyGV237JTlFwGhn_!!0-item_pic.jpg')">添加到购物车</button> </div>把这些代码改成<div class="product-item" data-category="novel"> <a href="333.html?id=1"> <img src="https://tse4-mm.cn.bing.net/th/id/OIP-C.E1l9GHWnkbNLjcnJpqTpPAHaKC?rs=1&pid=ImgDetMain" alt="产品1"> <h3>产品1:红楼梦</h3> <p>价格:$50</p> </a> <button 
onclick="addToCart('产品1:红楼梦', 50, 'https://tse4-mm.cn.bing.net/th/id/OIP-C.E1l9GHWnkbNLjcnJpqTpPAHaKC?rs=1&pid=ImgDetMain')">添加到购物车</button> </div>
05-23
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值