// 程序整体思路:
// 给出种子文件,解析其中的网址并加入队列,采用广度优先的方式逐个抓取。之前出现过的问题已逐一修复,经过我一天的测试验证,目前没有发现问题。运行时内存占用大约几兆。
#include <cstring>
#include <fstream>
#include <hash_set>
#include <iostream>
#include <queue>
#include <string>
#include <unordered_set>
#include <vector>
#include <WinSock2.h>
#pragma comment(lib,"ws2_32.lib")
using namespace std;
// Global crawler state shared by the functions in this file.

// FIFO frontier of page URLs waiting to be fetched; processing it front to
// back yields the breadth-first crawl order.
queue<string> URL;
// Page URLs that have already been queued, so they are not queued twice.
// std::unordered_set replaces the non-standard, deprecated hash_set; every
// member used in this file (find/insert/end/size/clear) is available on both.
unordered_set<string> visitedurl;
// Image URLs that have already been collected for download.
unordered_set<string> visitedimg;
// Sequence number printed for the next saved image; starts at 1 and is
// incremented after each image file is written.
int g_ImgCnt = 1;
// Initial size, in bytes (1 MiB), of the buffer that receives an HTTP response.
// Deliberately has no trailing semicolon so it can be used inside expressions.
#define DEFAULT_PAGE_BUF_SIZE 1048576
// Splits an "http://" URL into its host and resource (path) parts.
//
// url      - text containing "http://"; parsing starts at its first occurrence.
// host     - receives everything between "http://" and the next '/'
//            (or the end of the URL when there is no path).
// resource - receives the path starting at that '/', or "/" when the URL
//            has no path (e.g. "http://example.com").
//
// Returns false, leaving host and resource untouched, when the URL does not
// contain "http://" or when the host part is empty; returns true otherwise.
bool ParseURL(const string &url, string &host, string &resource)
{
    static const string scheme = "http://";

    const size_t schemePos = url.find(scheme);
    if(schemePos == string::npos)
        return false;

    const size_t hostBegin = schemePos + scheme.size();
    const size_t pathBegin = url.find('/', hostBegin);
    const size_t hostEnd = (pathBegin == string::npos) ? url.size() : pathBegin;

    // An empty host ("http://" or "http:///x") cannot be resolved meaningfully.
    if(hostEnd == hostBegin)
        return false;

    host = url.substr(hostBegin, hostEnd - hostBegin);
    // A URL without a path refers to the site root.
    resource = (pathBegin == string::npos) ? string("/") : url.substr(pathBegin);
    return true;
}
bool gethttpresponse(const string &host,const string &resource, string &response,int &bytes)
{
struct hostent *hp = gethostbyname(host.c_str());
if(hp == NULL){
cout<<"不能解析出主机地址!"<<endl;
return false;
}
SOCKET sock = socket(AF_INET,SOCK_STREAM,IPPROTO_TCP);
int nNetTimeout = 1000;
setsockopt(sock,SOL_SOCKET,SO_RCVTIMEO,(char *)&nNetTimeout,sizeof(int));
if(sock == -1 || sock == -2){
cout<<"不能创建socket!"<<endl;
return false;
}
//建立服务器地址
SOCKADDR_IN sa;
sa.sin_family = AF_INET;
sa.sin_port = htons(80);
memcpy(&sa.sin_addr,hp->h_addr,4);
//连接服务器
if(connect(sock,(SOCKADDR*)&sa,sizeof(sa)) != 0){
cout<<"不能连接服务器!"<<endl;
return false;
}
//准备发送数据
string request = "GET " + resource + " HTTP/1.1\r\nHost:" + host + "\r\nConnection:Close\r\n\r\n";
if(SOCKET_ERROR == send(sock,request.c_str(),request.size(),0)){
cout<<"发送数据错误!"<<endl;
return false;
}
//接收数据
int m_page_bufsize = DEFAULT_PAGE_BUF_SIZE;
char * buf = new char[m_page_bufsize];
memset(buf,0,m_page_bufsize);
int bytesread = 0;
int ret = 1;
cout<<"读取:";
while(ret > 0){
ret = recv(sock,buf + bytesread, m_page_bufsize - bytesread, 0);
if(ret > 0){
bytesread += ret;
}
if(m_page_bufsize - bytesread < 100){
cout<<endl<<"重新分配空间!"<<endl;
char * mbuf = new char[2 * m_page_bufsize];
strcpy(mbuf,buf);
delete [] buf;
buf = mbuf;
}
cout<<ret<<" ";
}
cout<<endl;
buf[bytesread] = '\0';
response.assign(buf,bytesread);
delete [] buf;
bytes = bytesread;
closesocket(sock);
return true;
}
// Returns true when `ext` is exactly one of the lower-case image extensions
// this crawler downloads.
static bool IsImageExtension(const string &ext)
{
    return ext == "jpg" || ext == "jpeg" || ext == "png" || ext == "gif" || ext == "bmp";
}

// Scans an HTTP response for links and image URLs.
//
// Pass 1: every `href="http://..."` target not seen before is recorded in
// visitedurl, appended to "url.txt" and pushed onto the global URL queue.
// Pass 2: every `http://...` text that runs up to the next '"' and ends in
// .jpg/.jpeg/.png/.gif/.bmp, and has not been seen before, is recorded in
// visitedimg and appended to `imgurls`.
//
// Each visited set is cleared once it grows past 100000 entries to bound
// memory use, so very old URLs may be revisited.
//
// response - raw response text to scan.
// imgurls  - receives newly discovered image URLs (appended, not cleared).
// Always returns true.
bool ParseHtml(const string& response, vector<string> &imgurls)
{
    // ---- Pass 1: hyperlinks ------------------------------------------------
    const string hrefPrefix = "href=\"";
    const string linkPattern = hrefPrefix + "http://";
    ofstream ofile("url.txt",ios::app);
    size_t found = response.find(linkPattern);
    while(found != string::npos){
        const size_t urlBegin = found + hrefPrefix.size();
        const size_t urlEnd = response.find('"',urlBegin + 1);
        // Unterminated attribute: the rest of the document is not a URL.
        if(urlEnd == string::npos)
            break;
        const string tmpurl = response.substr(urlBegin,urlEnd - urlBegin);
        if(visitedurl.find(tmpurl) == visitedurl.end()){
            visitedurl.insert(tmpurl);
            if(visitedurl.size() > 100000)
                visitedurl.clear();
            ofile<<tmpurl<<endl;
            URL.push(tmpurl);
        }
        found = response.find(linkPattern,urlEnd);
    }
    ofile.close();

    // ---- Pass 2: image URLs ------------------------------------------------
    const string scheme = "http://";
    found = response.find(scheme);
    while(found != string::npos){
        const size_t urlEnd = response.find('"',found + 1);
        // No closing quote anywhere after this point: nothing more to extract.
        if(urlEnd == string::npos)
            break;
        const string imgurl = response.substr(found,urlEnd - found);
        const size_t dot = imgurl.find_last_of('.');
        if(dot != string::npos
            && IsImageExtension(imgurl.substr(dot + 1))
            && visitedimg.find(imgurl) == visitedimg.end()){
            visitedimg.insert(imgurl);
            if(visitedimg.size() > 100000)
                visitedimg.clear();
            imgurls.push_back(imgurl);
        }
        // Resume right at the closing quote so no following URL is skipped.
        found = response.find(scheme,urlEnd);
    }
    cout<<"结束解析这个网页"<<endl;
    return true;
}
// Derives a file name from a URL.
//
// Appends to `filename` every character of `url` except
//   * \ / : ? < > | " . - and the space character,
// then appends the suffix ".txt". `filename` is appended to, not cleared.
// Always returns true.
bool Tofilename(const string url, string &filename)
{
    // Characters that are dropped from the URL.
    const string dropped = "*\\/:?<>|\".- ";

    for(char ch : url){
        const bool keep = (dropped.find(ch) == string::npos);
        if(keep)
            filename.push_back(ch);
    }
    filename.append(".txt");
    return true;
}
// Downloads every image in `imgurls` into the "./img" directory.
//
// imgurls - image URLs to fetch; entries whose extension is not
//           jpg/jpeg/png/gif/bmp are skipped.
// url     - URL of the page the images came from; currently unused because
//           all images are stored in the single "./img" directory.
//
// For each image the HTTP response is fetched, the headers (everything up to
// and including the first blank line) are stripped and the remaining bytes
// are written to "./img/<last path component of the image URL>". A failure on
// one image is reported on cout where a message exists and the loop moves on
// to the next image. g_ImgCnt is printed and incremented for every file written.
void Downloads(const vector<string> &imgurls, const string &url)
{
    (void)url; // kept for interface compatibility; see the note above

    const string directory = "./img";
    const string headerEnd = "\r\n\r\n";

    for(size_t i = 0; i < imgurls.size(); i++){
        const string &imgurl = imgurls[i];

        // Only fetch URLs that end in a known image extension.
        const size_t dot = imgurl.find_last_of('.');
        if(dot == string::npos)
            continue;
        const string ext = imgurl.substr(dot + 1);
        if(ext.compare("jpg") && ext.compare("jpeg") && ext.compare("png") && ext.compare("gif") && ext.compare("bmp"))
            continue;

        string host;
        string resource;
        if(!ParseURL(imgurl,host,resource)){
            cout<<"网址错误!"<<endl;
            continue; // one malformed URL must not abort the remaining downloads
        }

        string image;
        int bytes = 0;
        if(!gethttpresponse(host,resource,image,bytes))
            continue;
        if(image.empty()){
            cout<<"传回数据错误!"<<endl;
            continue;
        }

        // Locate the start of the body, just past the header terminator.
        const size_t headerPos = image.find(headerEnd);
        if(headerPos == string::npos){
            cout<<"传回数据错误!"<<endl;
            continue;
        }
        const size_t bodyStart = headerPos + headerEnd.size();
        if(bodyStart >= image.size()){
            cout<<"传回数据错误!"<<endl;
            continue;
        }

        // The file name is the last path component, leading '/' included.
        const size_t slash = imgurl.find_last_of('/');
        if(slash == string::npos)
            continue;
        const string path = directory + imgurl.substr(slash);

        ofstream ofile(path,ios::binary);
        if(!ofile.is_open())
            continue;
        cout<<g_ImgCnt++<<path<<endl;
        // Write the whole body; the final byte belongs to the image as well.
        ofile.write(&image[bodyStart],static_cast<streamsize>(image.size() - bodyStart));
        ofile.close();
    }
}
//广度优先
void BFS(string url)
{
string host;
string resource;
if(!ParseURL(url,host,resource)){
cout<<"网址错误!"<<endl;
return;
}
string response = "";
int bytes = 0;
if(!gethttpresponse(host,resource,response,bytes)){
cout<<"没有得到网页响应!"<<endl;
return;
}
//解析网页中所有的URL地址
if(response.size() == 0){
cout<<"服务器返回数据错误!"<<endl;
return;
}
//存储网页返回信息
string filename;
if(Tofilename(url,filename) == 0){
cout<<"转换名字错误!"<<endl;
return;
}
ofstream ofile("./html/"+ filename);
if(ofile.is_open()){
ofile<<response<<endl;
ofile.close();
}
else{
cout<<"打开文件错误!"<<endl;
return;
}
vector<string> imgurls;
if(ParseHtml(response,imgurls) == 0){
cout<<"解析网页错误"<<endl;
return;
}
Downloads(imgurls,url);
}
int main()
{
WSADATA wsadata;
if(WSAStartup(MAKEWORD(2,2),&wsadata) != 0){
cout<<"加载winsockt动态库失败"<<endl;
return 1;
}
CreateDirectory("./img",0);
CreateDirectory("./html",0);
//读取种子文件URL
string urlstart;
ifstream ifile("HrefURL.txt");
if(!ifile.is_open()){
cout<<"打开文件错误!"<<endl;
return 1;
}
while(getline(ifile,urlstart)){
URL.push(urlstart);
visitedurl.insert(urlstart);
}
ifile.close();
while(URL.size() > 0){
string str = URL.front();
cout<<str<<endl;
//广度优先,寻找种子文件里面所有URL
BFS(str);
URL.pop();
cout<<"URL数目:"<<URL.size()<<endl;
if(g_ImgCnt > 200000)
break;
}
WSACleanup();
return 0;
}
// 转载请注明出处:http://blog.youkuaiyun.com/ranyongqing/article/details/26603891