在上一篇博文中,我们曾实现了一个简单的爬虫程序,在那篇博文中,我们使用的是单线程的,感觉多少有些不好,所以在这篇博文中,我们就将其修改为多线程的,废话不多说了,直接上代码,如下:
#ifndef __HTTP_CURL__H
#define __HTTP_CURL__H
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <iostream>
#include <set>
#include <string>
#include <boost/smart_ptr.hpp>
#include <boost/thread/mutex.hpp>
#include <boost/thread/locks.hpp>
#include <boost/function.hpp>
#include <boost/bind.hpp>
#include <curl/curl.h>
using namespace std;
using namespace boost;
#define MAX_BUFFERSIZE 1024*10
#define MAX_THREAD 10
static int fileIndex = 1;
boost::mutex mut;
std::set<string> urlSet;
std::set<string> finishUrlSet;
typedef set<string>::iterator urlSet_Iter;
#define BEGIN_SPIDER(path) {urlSet.insert(path);}
class HttpCurl
{
public:
HttpCurl()
{
conn = NULL;
}
~HttpCurl()
{
curl_easy_cleanup(conn);
}
static bool HttpCurlInit()
{
urlSet.clear();
finishUrlSet.clear();
CURLcode code;
code = curl_global_init(CURL_GLOBAL_DEFAULT);
if(CURLE_OK != code)
{
printf("Failed to global init default\n");
return false;
}
return true;
}
bool InitCurlObject(string& context)
{
CURLcode code;
conn = curl_easy_init();
if(NULL == conn)
{
printf("Failed to create CURL\n");
return false;
}
if(!setWriteFunc())
{
printf("Failed to set write\n");
return false;
}
if(!setWriteBuff(context))
{
printf("Failed to set buffer\n");
return false;
}
return true;
}
bool setWriteFunc()
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_WRITEFUNCTION,HttpCurl::write);
if(CURLE_OK != code)
{
printf("Failed to set write\n");
return false;
}
return true;
}
bool setWriteBuff(string& context)
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_WRITEDATA,&context);
if(CURLE_OK != code)
{
printf("Failed to set write data\n");
return false;
}
return true;
}
bool setUrl(string& url)
{
CURLcode code;
code = curl_easy_setopt(conn,CURLOPT_URL,url.c_str());
if(CURLE_OK != code)
{
printf("Failed to set URL\n");
return false;
}
return true;
}
bool getHttpResponse()
{
CURLcode code;
assert(conn);
code = curl_easy_perform(conn);
if(CURLE_OK != code)
{
printf("Failed to get response\n");
return false;
}
return true;
}
static long write(void* data,int size,int nmemb,string& context)
{
long sizes = size*nmemb;
std::string temp((char*)data,sizes);
context += temp;
return sizes;
}
bool save(const string& context,string filename)
{
CURLcode code;
int retcode = 0;
code = curl_easy_getinfo(conn,CURLINFO_RESPONSE_CODE,&retcode);
if((CURLE_OK == code)&& retcode ==200)
{
int length = strlen(context.c_str());
FILE* file = fopen(filename.c_str(),"w+");
fseek(file,0,SEEK_SET);
fwrite(context.c_str(),1,length,file);
fclose(file);
return true;
}
return false;
}
private:
CURL* conn;
};
class Spider
{
public:
Spider(shared_ptr<HttpCurl>& cul):httpCurl(cul)
{
httpCurlUrlSet.clear();
}
~Spider(){}
bool initCurl(string& context)
{
return httpCurl->InitCurlObject(context);
}
void parseUrl(const string& context)
{
const string tag = "href";
const string tag2 = "\"";
const string tag3 = "http";
string::size_type tempBegin,tempEnd,iter,httpIter;
tempBegin = tempEnd = 0;
iter= context.find(tag);
while(iter != string::npos)
{
tempBegin = context.find(tag2,iter);
if(tempBegin != string::npos)
{
++tempBegin;
tempEnd = context.find(tag2,tempBegin);
}
if(tempEnd != string::npos && tempEnd > tempBegin)
{
string url;
url.assign(context,tempBegin,(tempEnd-tempBegin));
httpIter = url.find(tag3);
if(httpIter != string::npos)
httpCurlUrlSet.insert(url);
}
iter = context.find(tag,tempEnd);
}
}
bool write(const string& context,const string& filename)
{
return httpCurl->save(context,filename);
}
void start(string url,string& context)
{
char filename[64];
memset(filename,0,sizeof(filename));
sprintf(filename,"%d.html",fileIndex++);
httpCurl->setUrl(url);
if(httpCurl->getHttpResponse())
{
parseUrl(context);
write(context,filename);
insertUrl();
}
}
void insertUrl()
{
boost::unique_lock<boost::mutex> lock(mut);
for( urlSet_Iter iter = httpCurlUrlSet.begin();iter != httpCurlUrlSet.end();++iter)
urlSet.insert(*iter);
httpCurlUrlSet.clear();
}
void displayUrl()
{
urlSet_Iter iter = urlSet.begin();
for(; iter != urlSet.end();++iter)
{
cout<<*iter<<endl;
}
}
string getUrl()
{
urlSet_Iter iter;
string url;
boost::unique_lock<boost::mutex> lock(mut);
for(iter = urlSet.begin();iter != urlSet.end();++iter)
{
if(finishUrlSet.find(*iter) != finishUrlSet.end())
continue;
break;
}
if(iter != urlSet.end())
{
url = *iter;
urlSet.erase(iter);
finishUrlSet.insert(url);
return url;
}
return "";
}
private:
shared_ptr<HttpCurl> httpCurl;
std::set<std::string> httpCurlUrlSet;
};
class SpiderThread
{
typedef boost::function<void()> func;
public:
SpiderThread(shared_ptr<Spider>& spider):spider(spider)
{
context.clear();
InitSpider(spider,context);
}
SpiderThread(shared_ptr<Spider>& spider,func& fun):spider(spider),fun(fun)
{
context.clear();
InitSpider(spider,context);
}
~SpiderThread()
{
stop();
}
void InitSpider(shared_ptr<Spider>& spider,string& context)
{
spider->initCurl(context);
}
static void* start(void* arg)
{
printf("start...\n");
SpiderThread* thread = static_cast<SpiderThread*>(arg);
thread->fun();
return NULL;
}
void setFunc(const func& fun)
{
this->fun = fun;
}
shared_ptr<Spider> getHttpCurl()
{
return spider;
}
string& getContext()
{
return context;
}
void run()
{
pthread_create(&pThread,NULL,&start,this);
}
void stop()
{
pthread_join(pThread,NULL);
}
private:
pthread_t pThread;
shared_ptr<Spider> spider;
func fun;
string context;
};
static void* loop(SpiderThread* spiderThread)
{
for(;;)
{
string url = spiderThread->getHttpCurl()->getUrl();
printf("url=%s\n",url.c_str());
if(url != "")
spiderThread->getHttpCurl()->start(url,spiderThread->getContext());
}
return NULL;
}
#endif
测试程序如下:
#include "curlTest.h"
int main()
{
HttpCurl::HttpCurlInit();
BEGIN_SPIDER("www.baidu.com");
shared_ptr<HttpCurl> curl1(new HttpCurl());
shared_ptr<Spider> spider1(new Spider(curl1));
spider1->displayUrl();
SpiderThread spiderThread(spider1);
boost::function<void()> f = boost::bind(&loop,&spiderThread);
spiderThread.setFunc(f);
spiderThread.run();
shared_ptr<HttpCurl> curl2(new HttpCurl());
shared_ptr<Spider>spider2(new Spider(curl2));
SpiderThread spiderThread1(spider2);
boost::function<void()> f1 = boost::bind(&loop,&spiderThread1);
spiderThread1.setFunc(f1);
spiderThread1.run();
//sleep(10);
return 0;
}
测试结果:
url=http://anquan.baidu.com/bbs/thread-10093-1-1.html
url=http://anquan.baidu.com/bbs/thread-10097-1-1.html
url=http://anquan.baidu.com/bbs/thread-10106-1-1.html
url=http://anquan.baidu.com/bbs/thread-10112-1-1.html
url=http://anquan.baidu.com/bbs/thread-10114-1-1.html
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=80417&ptid=10112
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=80421&ptid=10114
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=80450&ptid=10114
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=80458&ptid=10114
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=80466&ptid=10114
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=80475&ptid=10114
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=80843&ptid=10112
url=http://anquan.baidu.com/bbs/thread-10117-1-1.html
url=http://anquan.baidu.com/bbs/thread-10121-1-1.html
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=80404&ptid=10117
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=80424&ptid=10117
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&goto=findpost&pid=80427&ptid=10117
总结
这篇博文主要是针对上篇博文进行了修改,使其支持多线程,整个测试下来,感觉其速度明显要快于单线程,在设计的过程中主要是将之前的功能重新又封装了一层,并且在这层面上实现了简单的线程处理函数,这点也是参考了之前写的多线程池的思想,但是这个支持多线程的爬虫程序,同样有几点不足:1)使用了boost::mutex进行同步化,这种方式最大的弊端就是在关键的节点上呈现出了单线程的特性,不利于提高整体的解析能力,2)在将爬到的结果写入文件时,没有考虑使用同步机制,主要是考虑到简洁性,部分文件名重叠的情形不影响整个爬虫解析能力,等有时间再将其改下,3)个人感觉完全可以将测试程序中的一些步骤进行抽象化为线程池的设计思想,这样使得测试程序更加的明晰易懂。总之,本程序只是个人业余时间写的程序,纯属个人娱乐,不足以和商业爬虫相比,里面有些编码不是很规范,请见谅,多谢了,本篇博文到此结束,谢谢
如果需要,请注明转载,多谢