开源项目(库)之libcurl学习(二)

本文介绍了一个改进的多线程网页爬虫程序,对比单线程爬虫,提高了解析速度。通过重新封装功能并实现简单线程处理,程序支持多线程解析网页。尽管存在使用boost::mutex导致的单线程瓶颈、文件写入未考虑同步机制等问题,作者提出后续优化方向。程序展示了多线程爬虫的基本实现,旨在提升网页内容抓取效率。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

在上一篇博文中,我们实现了一个简单的单线程爬虫程序。单线程的抓取效率不太理想,所以在这篇博文中,我们将其修改为多线程版本。废话不多说,直接上代码,如下:

#ifndef __HTTP_CURL__H
#define __HTTP_CURL__H

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <cassert>
#include <iostream>
#include <set>
#include <string>

#include <boost/bind.hpp>
#include <boost/function.hpp>
#include <boost/smart_ptr.hpp>
#include <boost/thread/locks.hpp>
#include <boost/thread/mutex.hpp>
#include <curl/curl.h>
using namespace std;
using namespace boost;

// Buffer size constant (10 KiB). Parenthesised so the macro expands as a single
// value inside larger expressions such as `n / MAX_BUFFERSIZE`.
// Not referenced by the code visible in this header.
#define MAX_BUFFERSIZE (1024*10)
// Thread-count constant. Not referenced by the code visible in this header.
#define MAX_THREAD 10
// Next number used by Spider::start() to build the "<n>.html" output file name.
// Being `static` in a header, every translation unit that includes it gets its
// own counter.
static int fileIndex = 1;

// Mutex locked by Spider::insertUrl() and Spider::getUrl() while they touch
// the two URL sets below.
boost::mutex mut;
// URLs discovered so far that are candidates for fetching.
std::set<string> urlSet;
// URLs that Spider::getUrl() has already handed out to a worker.
std::set<string> finishUrlSet;
// Iterator type used when walking the URL sets.
typedef set<string>::iterator urlSet_Iter;

// Seeds the crawl by inserting `path` into urlSet. The macro does not lock
// `mut`, so it is presumably meant to be used before any worker thread starts.
#define BEGIN_SPIDER(path) {urlSet.insert(path);}
class HttpCurl
{
    public:
        HttpCurl()
        {
            conn = NULL;
        }
        ~HttpCurl()
        {
            curl_easy_cleanup(conn);
        }

        static bool HttpCurlInit()
        {
            urlSet.clear();
            finishUrlSet.clear();
            CURLcode code;
            code = curl_global_init(CURL_GLOBAL_DEFAULT);
            if(CURLE_OK != code)
            {
                printf("Failed to global init default\n");
                return false;
            }
            return true;
        }

        bool InitCurlObject(string& context)
        {
            CURLcode code;
            conn = curl_easy_init();
            if(NULL == conn)
            {
                printf("Failed to create CURL\n");
                return false;
            }
            if(!setWriteFunc())
            {
                printf("Failed to set write\n");
                return false;
            }
            if(!setWriteBuff(context))
            {
                printf("Failed to set buffer\n");
                return false;
            }
            return true;
        }

        bool setWriteFunc()
        {
            CURLcode code;
            code = curl_easy_setopt(conn,CURLOPT_WRITEFUNCTION,HttpCurl::write);
            if(CURLE_OK != code)
            {
                printf("Failed to set write\n");
                return false;
            }
            return true;
        }
        bool setWriteBuff(string& context)
        {
            CURLcode code;
            code = curl_easy_setopt(conn,CURLOPT_WRITEDATA,&context);
            if(CURLE_OK != code)
            {
                printf("Failed to set write data\n");
                return false;
            }
            return true;
        }

        bool setUrl(string& url)
        {
            CURLcode code;
            code = curl_easy_setopt(conn,CURLOPT_URL,url.c_str());
            if(CURLE_OK != code)
            {
                printf("Failed to set URL\n");
                return false;
            }
            return true;
        }

        bool getHttpResponse()
        {
            CURLcode code;
            assert(conn);
            code = curl_easy_perform(conn);
            if(CURLE_OK != code)
            {
                printf("Failed to get response\n");
                return false;
            }

            return true;
        }
        static long write(void* data,int size,int nmemb,string& context)
        {
            long sizes = size*nmemb;
            std::string temp((char*)data,sizes);
            context += temp;
            return sizes;
        }

        bool save(const string& context,string filename)
        {
            CURLcode code;
            int retcode = 0;
            code = curl_easy_getinfo(conn,CURLINFO_RESPONSE_CODE,&retcode);
            if((CURLE_OK == code)&& retcode ==200)
            {
                int length = strlen(context.c_str());
                FILE* file = fopen(filename.c_str(),"w+");
                fseek(file,0,SEEK_SET);
                fwrite(context.c_str(),1,length,file);
                fclose(file);
                return  true;
            }
            return false;
        }
    private:
        CURL* conn;
};

class Spider
{
    public:
        Spider(shared_ptr<HttpCurl>& cul):httpCurl(cul)
        {
            httpCurlUrlSet.clear();
        }
        ~Spider(){}

        bool initCurl(string& context)
        {
            return httpCurl->InitCurlObject(context);
        }

        void parseUrl(const string& context)
        {
            const string tag = "href";
            const string tag2 = "\"";
            const string tag3 = "http";
            string::size_type tempBegin,tempEnd,iter,httpIter;
            tempBegin = tempEnd = 0;
            iter= context.find(tag);
            while(iter != string::npos)
            {
                tempBegin = context.find(tag2,iter);
                if(tempBegin != string::npos)
                {
                    ++tempBegin;
                    tempEnd = context.find(tag2,tempBegin);
                }
                if(tempEnd != string::npos && tempEnd > tempBegin)
                {
                    string url;
                    url.assign(context,tempBegin,(tempEnd-tempBegin));
                    httpIter = url.find(tag3);
                    if(httpIter != string::npos)
                        httpCurlUrlSet.insert(url);
                }
                iter = context.find(tag,tempEnd);
            }
        }
        bool write(const string& context,const string& filename)
        {
            return httpCurl->save(context,filename);
        }

        void start(string url,string& context)
        {
            char filename[64];
            memset(filename,0,sizeof(filename));
            sprintf(filename,"%d.html",fileIndex++);

            httpCurl->setUrl(url);
            if(httpCurl->getHttpResponse())
            {
               parseUrl(context);
               write(context,filename);
               insertUrl();
            }
        }

        void insertUrl()
        {
            boost::unique_lock<boost::mutex> lock(mut);
            for( urlSet_Iter iter = httpCurlUrlSet.begin();iter != httpCurlUrlSet.end();++iter)
                urlSet.insert(*iter);
            httpCurlUrlSet.clear();
        }

        void displayUrl()
        {
            urlSet_Iter iter = urlSet.begin();
            for(; iter != urlSet.end();++iter)
            {
                cout<<*iter<<endl;
            }
        }
        string getUrl()
        {
            urlSet_Iter iter;
            string url;
            boost::unique_lock<boost::mutex> lock(mut);
            for(iter = urlSet.begin();iter != urlSet.end();++iter)
            {
                if(finishUrlSet.find(*iter) != finishUrlSet.end())
                    continue;
                break;
            }
            if(iter != urlSet.end())
            {
                url = *iter;
                urlSet.erase(iter);
                finishUrlSet.insert(url);
                return url;
            }
            return "";
        }

    private:
        shared_ptr<HttpCurl> httpCurl;
        std::set<std::string> httpCurlUrlSet;

};

class SpiderThread
{
    typedef boost::function<void()> func;
    public:
        SpiderThread(shared_ptr<Spider>& spider):spider(spider)
        {
            context.clear();
            InitSpider(spider,context);
        }
        SpiderThread(shared_ptr<Spider>& spider,func& fun):spider(spider),fun(fun)
        {
            context.clear();
            InitSpider(spider,context);
        }
        ~SpiderThread()
        {
            stop();
        }

        void InitSpider(shared_ptr<Spider>& spider,string& context)
        {
             spider->initCurl(context);
        }

        static void* start(void* arg)
        {
            printf("start...\n");
            SpiderThread* thread = static_cast<SpiderThread*>(arg);
            thread->fun();
            return NULL;
        }

        void setFunc(const func& fun)
        {
            this->fun = fun;
        }

        shared_ptr<Spider> getHttpCurl()
        {
            return spider;
        }
        string& getContext()
        {
            return context;
        }
        void run()
        {
            pthread_create(&pThread,NULL,&start,this);
        }
        void stop()
        {
            pthread_join(pThread,NULL);
        }

    private:
        pthread_t pThread;
        shared_ptr<Spider> spider;
        func fun;
        string context;
};

/**
 * Worker body: repeatedly takes the next pending URL from the spider owned by
 * `spiderThread` and fetches it. The loop never finishes on its own, because
 * another worker may still publish new URLs.
 *
 * @param spiderThread thread object providing the Spider and download buffer.
 * @return NULL (only reached when `spiderThread` is NULL).
 */
static void* loop(SpiderThread* spiderThread)
{
    if(NULL == spiderThread)
        return NULL;
    for(;;)
    {
        const std::string url = spiderThread->getHttpCurl()->getUrl();
        if(url.empty())
        {
            // Nothing pending right now: back off briefly instead of spinning
            // on the mutex and flooding stdout with empty "url=" lines.
            usleep(100*1000);
            continue;
        }
        printf("url=%s\n",url.c_str());
        spiderThread->getHttpCurl()->start(url,spiderThread->getContext());
    }
    return NULL;
}

#endif

测试程序如下:

#include "curlTest.h"

int main()
{
    HttpCurl::HttpCurlInit();
    BEGIN_SPIDER("www.baidu.com");

    shared_ptr<HttpCurl> curl1(new HttpCurl());
    shared_ptr<Spider> spider1(new Spider(curl1));
    spider1->displayUrl();
    SpiderThread spiderThread(spider1);
    boost::function<void()> f = boost::bind(&loop,&spiderThread);
    spiderThread.setFunc(f);
    spiderThread.run();

    shared_ptr<HttpCurl> curl2(new HttpCurl());
    shared_ptr<Spider>spider2(new Spider(curl2));
    SpiderThread spiderThread1(spider2);
    boost::function<void()> f1 = boost::bind(&loop,&spiderThread1);
    spiderThread1.setFunc(f1);
    spiderThread1.run();
    //sleep(10);
    return 0;
}

测试结果:

url=http://anquan.baidu.com/bbs/thread-10093-1-1.html
url=http://anquan.baidu.com/bbs/thread-10097-1-1.html
url=http://anquan.baidu.com/bbs/thread-10106-1-1.html
url=http://anquan.baidu.com/bbs/thread-10112-1-1.html
url=http://anquan.baidu.com/bbs/thread-10114-1-1.html
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=80417&amp;ptid=10112
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=80421&amp;ptid=10114
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=80450&amp;ptid=10114
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=80458&amp;ptid=10114
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=80466&amp;ptid=10114
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=80475&amp;ptid=10114
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=80843&amp;ptid=10112
url=http://anquan.baidu.com/bbs/thread-10117-1-1.html
url=http://anquan.baidu.com/bbs/thread-10121-1-1.html
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=80404&amp;ptid=10117
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=80424&amp;ptid=10117
url=http://anquan.baidu.com/bbs/forum.php?mod=redirect&amp;goto=findpost&amp;pid=80427&amp;ptid=10117

总结

       这篇博文主要是针对上篇博文进行了修改,使其支持多线程。整个测试下来,感觉其速度明显要快于单线程。在设计的过程中,主要是将之前的功能重新又封装了一层,并且在这一层上实现了简单的线程处理函数,这点也是参考了之前写的多线程池的思想。但是这个支持多线程的爬虫程序,同样有几点不足:1)使用了boost::mutex进行同步化,这种方式最大的弊端就是在关键的节点上呈现出了单线程的特性,不利于提高整体的解析能力;2)在将爬到的结果写入文件时,没有考虑使用同步机制,主要是考虑到简洁性,部分文件名重叠的情形不影响整个爬虫解析能力,等有时间再将其改下;3)个人感觉完全可以将测试程序中的一些步骤抽象为线程池的设计思想,这样使得测试程序更加明晰易懂。总之,本程序只是个人业余时间写的程序,纯属个人娱乐,不足以和商业爬虫相比,里面有些编码不是很规范,请见谅,多谢了。本篇博文到此结束,谢谢

如果需要,请注明转载,多谢

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值