C++ 手写底层爬虫：知识面比较综合。不是很喜欢写博客，望大家喜欢。

最新推荐文章于 2022-09-21 19:59:13 发布

阿翔同学

最新推荐文章于 2022-09-21 19:59:13 发布

阅读量184

点赞数 1

CC 4.0 BY-SA版权

本文链接：https://blog.youkuaiyun.com/qq_44065088/article/details/97432459

#define _SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS
#define _WINSOCK_DEPRECATED_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS

#include "winsock2.h"

#include <time.h>

#include <climits>
#include <fstream>
#include <hash_set>
#include <iostream>
#include <queue>
#include <string>
#include <unordered_set>
#include <vector>

#define _SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS 
#define _WINSOCK_DEPRECATED_NO_WARNINGS 
#define _CRT_SECURE_NO_WARNINGS   

#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include "winsock2.h"
#include <time.h>
#include <queue>
#include <hash_set>

#define MAX_URL_LEN  2083   // longest URL, in characters, that ParseURL accepts
#define HOST_NAME    256    // a host name must be shorter than this many characters

#pragma comment(lib, "ws2_32.lib")   // link against the Winsock 2 library
using namespace std;

#define DEFAULT_PAGE_BUF_SIZE 1048576   // initial size, in bytes, of the HTTP response buffer

// Hyperlinks that have been discovered but not crawled yet (FIFO order).
queue<string> hrefUrl;
// Every hyperlink seen so far; used to avoid queueing the same link twice.
// std::unordered_set replaces the deprecated, non-standard hash_set and offers
// the same find/insert/end interface.
unordered_set<string> visitedUrl;
// Every image URL seen so far; used to avoid downloading the same image twice.
unordered_set<string> visitedImg;
//int g_depth = 0;
// Sequence number printed for the next saved image; starts at 1 and grows by
// one for every image written to disk.
int g_ImgCnt = 1;

/*************************************
* 功能描述：对URL进行解析，得到主机名，资源名
*
* 返回值： 成功 - ture , 失败 - false
*************************************/

bool ParseURL(const string & url, string & host, string & resource) {
    char pHost[HOST_NAME];
    char pResource[MAX_URL_LEN];
    const char * pos = NULL;

    if (strlen(url.c_str()) > MAX_URL_LEN) {
        return false;
    }

    //定位主机名称位置
    pos = strstr(url.c_str(), "http://");

    if (pos == NULL) {
        pos = url.c_str();
    }
    else {
        pos += strlen("http://");
    }

    if (strstr(pos, "/") == 0) {
        return false;
    }

    sscanf_s(pos, "%[^/]%s", pHost, HOST_NAME, pResource, MAX_URL_LEN);
    host = pHost;
    resource = pResource;
    return true;
}

/********************************************
*功能：发送http Get请求到服务器，获取响应页面
*返回值： 成功 - ture , 失败 - false
********************************************/
bool GetHttpResponse(const string &url, char * &response, int &bytesRead) {
    string host, resource;
    if (!ParseURL(url, host, resource)) {
        cout << "Can not parse the url" << endl;
        return false;
    }

    //解析域名，获取域名对应的ip地址
    struct hostent * hp = gethostbyname(host.c_str());
    if (hp == NULL) {
        cout << "Can not find host address" << endl;
        return false;
    }


    //创建套接字
    SOCKET sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (sock == -1 || sock == -2) {
        cout << "Can not create sock." << endl;
        return false;
    }

    //建立服务器地址
    SOCKADDR_IN sa;
    sa.sin_family = AF_INET;
    sa.sin_port = htons(80);
    memcpy(&sa.sin_addr, hp->h_addr, 4);

    //建立连接
    if (0 != connect(sock, (SOCKADDR*)&sa, sizeof(sa))) {
        cout << "Can not connect to : " << url << endl;
        closesocket(sock);
        return false;
    };

    //准备发送http 请求数据
    string request = "GET " + resource + " HTTP/1.1\r\nHost:" + host + "\r\nConnection:Close\r\n\r\n";

    //发送数据
    if (SOCKET_ERROR == send(sock, request.c_str(), request.size(), 0)) {
        cout << "send error" << endl;
        closesocket(sock);
        return false;
    }

    //接收数据
    int contentLength = DEFAULT_PAGE_BUF_SIZE;
    char *pageBuf = (char *)malloc(contentLength);
    memset(pageBuf, 0, contentLength);

    bytesRead = 0;
    int ret = 1;
    cout << "Read: ";
    while (ret > 0) {
        ret = recv(sock, pageBuf + bytesRead, contentLength - bytesRead, 0);

        if (ret > 0)
        {
            bytesRead += ret;
        }

        if (contentLength - bytesRead<1024) {
            cout << "\nRealloc memorry" << endl;
            contentLength *= 2;
            pageBuf = (char*)realloc(pageBuf, contentLength);       //重新分配内存
        }
        cout << " read: " << ret << " ";
    }
    cout << endl;

    pageBuf[bytesRead] = '\0';
    response = pageBuf;
    closesocket(sock);
    return true;

}

//
/********************************************
*功能：  提取所有的网页URL以及图片对应的URL
*返回值： 成功 - ture , 失败 - false
********************************************/
void HTMLParse(string & htmlResponse, vector<string> & imgurls, const string & host) {
    //找所有连接，加入queue中
    const char *p = htmlResponse.c_str();
    char *tag = "href=\"";
    const char *pos = strstr(p, tag);
    ofstream ofile("url.txt", ios::app);
    while (pos) {
        pos += strlen(tag);
        const char * nextQ = strstr(pos, "\"");
        if (nextQ) {
            char * url = new char[nextQ - pos + 1];
            //char url[100]; //固定大小的会发生缓冲区溢出的危险
            sscanf(pos, "%[^\"]", url);
            string surl = url;  // 转换成string类型，可以自动释放内存
            if (visitedUrl.find(surl) == visitedUrl.end()) {
                visitedUrl.insert(surl);
                ofile << surl << endl;
                hrefUrl.push(surl);
            }
            pos = strstr(pos, tag);
            delete[] url;  // 释放掉申请的内存
        }
    }
    ofile << endl << endl;
    ofile.close();

    tag = "<img ";
    const char* att1 = "src=\"";
    const char* att2 = "lazy-src=\"";
    const char *pos0 = strstr(p, tag);
    while (pos0) {
        pos0 += strlen(tag);
        const char* pos2 = strstr(pos0, att2);
        if (!pos2 || pos2 > strstr(pos0, ">")) {
            pos = strstr(pos0, att1);
            if (!pos) {
                pos0 = strstr(att1, tag);
                continue;
            }
            else {
                pos = pos + strlen(att1);
            }
        }
        else {
            pos = pos2 + strlen(att2);
        }

        const char * nextQ = strstr(pos, "\"");
        if (nextQ) {
            char * url = new char[nextQ - pos + 1];
            sscanf(pos, "%[^\"]", url);
            cout << url << endl;
            string imgUrl = url;
            if (visitedImg.find(imgUrl) == visitedImg.end()) {
                visitedImg.insert(imgUrl);
                imgurls.push_back(imgUrl);
            }
            pos0 = strstr(pos0, tag);
            delete[] url;
        }
    }
    cout << "end of Parse this html" << endl;
}

/*********************************************
* Turns a URL into a string that can be used as a file name.
*
* Each of the characters \ / : * ? " < > | is replaced by '-', every other
* character is kept, and ".txt" is appended to the result.
*
* Parameters:
*      url - URL to convert
* Returns:
*      the converted string
*********************************************/
string UrlToFileName(const string &url) {
    const char * const reserved = "\\/:*?\"<>|";

    string sanitized(url);
    string::size_type at = sanitized.find_first_of(reserved);
    while (at != string::npos) {
        sanitized[at] = '-';
        at = sanitized.find_first_of(reserved, at + 1);
    }

    sanitized += ".txt";
    return sanitized;
}

/*********************************************
*下载图片列表并保存到img文件夹
*参数：
*      url - 图片对应的路径，接下来会转换成保存图片的文件夹的名字
*      immgurls - 待下载的图片列表
*返回值：
*      无
*********************************************/
void DownLoadImg(vector<string> & imgurls, const string &url) {

    //生成保存该url下图片的文件夹
    string foldname = UrlToFileName(url);
    foldname = "./img/" + foldname;
    if (!CreateDirectory(foldname.c_str(), NULL))
        cout << "Can not create directory:" << foldname << endl;
    char *image;
    int byteRead;
    for (unsigned int i = 0; i<imgurls.size(); i++) {
        //判断是否为图片，bmp，jgp，jpeg，gif 
        string str = imgurls[i];
        int pos = str.find_last_of(".");
        if (pos == string::npos)
            continue;
        else {
            string ext = str.substr(pos + 1, str.size() - pos - 1);
            if (ext != "bmp"&& ext != "jpg" && ext != "jpeg"&& ext != "gif"&&ext != "png")
                continue;
        }
        //下载其中的内容
        if (GetHttpResponse(imgurls[i], image, byteRead)) {
            if (strlen(image) == 0) {
                continue;
            }
            const char *p = image;
            const char * pos = strstr(p, "\r\n\r\n") + strlen("\r\n\r\n");
            int index = imgurls[i].find_last_of("/");
            if (index != string::npos) {
                string imgname = imgurls[i].substr(index, imgurls[i].size());
                ofstream ofile(foldname + imgname, ios::binary);
                if (!ofile.is_open())
                    continue;
                cout << g_ImgCnt++ << foldname + imgname << endl;
                ofile.write(pos, byteRead - (pos - p));
                ofile.close();
            }
            free(image);
        }
    }
}

/************************************************
*采用遍历的方式获取网页中的url
*参数：
*     url - 目标url
*返回值：
*        无
*************************************************/
void WebSpider(const string &url) {
    char * response;
    int bytes;
    // 获取网页的相应，放入response中。
    if (!GetHttpResponse(url, response, bytes)) {
        cout << "The url is wrong! ignoring..." << endl;
        return;
    }
    string httpResponse = response;
    free(response);
    string filename = UrlToFileName(url);
    ofstream ofile("./html/" + filename);
    if (ofile.is_open()) {
        // 保存该网页的文本内容
        ofile << httpResponse << endl;
        ofile.close();
    }
    vector<string> imgurls;
    //解析该网页的属于本网站的网页链接，以及所有图片链接，放入imgurls里面
    HTMLParse(httpResponse, imgurls, url);

    //下载所有的图片资源
    DownLoadImg(imgurls, url);
}

void main(void)
{
    char cmd[64];
    sprintf_s(cmd,"color a");
    system(cmd);
    //初始化socket，用于tcp网络连接
    WSADATA wsaData;
    if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
        return;
    }

    // 创建文件夹，保存图片和网页文本文件
    CreateDirectory("./img", 0);
    CreateDirectory("./html", 0);

    string  kongjie1 = "http://www.xinkongjie.com/d/file/meinv/mnkj/2019-02-26/6552ef0eafbbe01f358e9fd1a6cb8e54.jpg";

    //WebSpider(kongjie1);
    vector<string> imgurls;
    imgurls.push_back(kongjie1);

    DownLoadImg(imgurls, "www.xinkongjie.com/d");

    string urlStart = "http://www.xinkongjie.com/meinv/";

    // 使用广度遍历
    // 提取网页中的超链接放入hrefUrl中，提取图片链接，下载图片。
    WebSpider(urlStart);

    // 访问过的网址保存起来
    visitedUrl.insert(urlStart);

    while (hrefUrl.size() != 0) {
    string url = hrefUrl.front();  // 从队列的最开始取出一个网址
    cout << url << endl;
    WebSpider(url);                      // 遍历提取出来的那个网页，找它里面的超链接网页放入hrefUrl，下载它里面的文本，图片
    hrefUrl.pop();                 // 遍历完之后，删除这个网址

    if (g_ImgCnt > 50) break;
    }

    WSACleanup();
    system("pause");
    return;
}

#define MAX_URL_LEN 2083 // longest URL, in characters, that ParseURL accepts
#define HOST_NAME 256    // a host name must be shorter than this many characters

#pragma comment(lib, "ws2_32.lib")   // link against the Winsock 2 library
using namespace std;

#define DEFAULT_PAGE_BUF_SIZE 1048576   // initial size, in bytes, of the HTTP response buffer

// Hyperlinks that have been discovered but not crawled yet (FIFO order).
queue<string> hrefUrl;
// Every hyperlink seen so far; used to avoid queueing the same link twice.
// std::unordered_set replaces the deprecated, non-standard hash_set and offers
// the same find/insert/end interface.
unordered_set<string> visitedUrl;
// Every image URL seen so far; used to avoid downloading the same image twice.
unordered_set<string> visitedImg;
//int g_depth = 0;
// Sequence number printed for the next saved image; starts at 1 and grows by
// one for every image written to disk.
int g_ImgCnt = 1;

/*************************************
* Splits a URL into a host name and a resource path.
*
* Leading whitespace and an optional "http://" prefix are skipped. The text up
* to the first '/' that follows is the host; the resource starts at that '/'
* and runs to the first whitespace character or the end of the URL.
*
* Parameters:
*      url      - URL to split, with or without the "http://" prefix
*      host     - receives the host name; left untouched on failure
*      resource - receives the resource path; left untouched on failure
* Returns: true on success; false when the URL is longer than MAX_URL_LEN,
*          contains no '/' after the host, or its host is empty or not
*          shorter than HOST_NAME characters
*************************************/
bool ParseURL(const string & url, string & host, string & resource) {
    const string scheme("http://");
    const char * const blanks = " \t\r\n\v\f";

    if (url.size() > static_cast<string::size_type>(MAX_URL_LEN)) {
        return false;
    }

    string::size_type hostBegin = url.find_first_not_of(blanks);
    if (hostBegin == string::npos) {
        return false;
    }

    // Only a prefix counts as the scheme: an "http://" that appears later
    // (for example inside a query string) belongs to the resource.
    if (url.compare(hostBegin, scheme.size(), scheme) == 0) {
        hostBegin += scheme.size();
    }

    const string::size_type slash = url.find('/', hostBegin);
    if (slash == string::npos) {
        return false;
    }

    // Reject an empty or over-long host explicitly instead of handing
    // unspecified data to the caller.
    const string::size_type hostLen = slash - hostBegin;
    if (hostLen == 0 || hostLen >= static_cast<string::size_type>(HOST_NAME)) {
        return false;
    }

    // The resource ends at the first whitespace character, if there is one.
    const string::size_type resourceEnd = url.find_first_of(blanks, slash);
    const string::size_type resourceLen =
        (resourceEnd == string::npos) ? string::npos : resourceEnd - slash;

    host = url.substr(hostBegin, hostLen);
    resource = url.substr(slash, resourceLen);
    return true;
}

/********************************************
*功能：发送http Get请求到服务器，获取响应页面
*返回值：成功 - ture , 失败 - false
********************************************/
bool GetHttpResponse(const string &url, char * &response, int &bytesRead) {
   string host, resource;
   if (!ParseURL(url, host, resource)) {
       cout << "Can not parse the url" << endl;
       return false;
   }

   //解析域名，获取域名对应的ip地址
   struct hostent * hp = gethostbyname(host.c_str());
   if (hp == NULL) {
       cout << "Can not find host address" << endl;
       return false;
   }

   //创建套接字
   SOCKET sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
   if (sock == -1 || sock == -2) {
       cout << "Can not create sock." << endl;
       return false;
   }

   //建立服务器地址
   SOCKADDR_IN sa;
   sa.sin_family = AF_INET;
   sa.sin_port = htons(80);
   memcpy(&sa.sin_addr, hp->h_addr, 4);

   //建立连接
   if (0 != connect(sock, (SOCKADDR*)&sa, sizeof(sa))) {
       cout << "Can not connect to : " << url << endl;
       closesocket(sock);
       return false;
   };

//准备发送http 请求数据
string request = "GET " + resource + " HTTP/1.1\r\nHost:" + host + "\r\nConnection:Close\r\n\r\n";

   //发送数据
   if (SOCKET_ERROR == send(sock, request.c_str(), request.size(), 0)) {
       cout << "send error" << endl;
       closesocket(sock);
       return false;
   }

   //接收数据
   int contentLength = DEFAULT_PAGE_BUF_SIZE;
   char *pageBuf = (char *)malloc(contentLength);
   memset(pageBuf, 0, contentLength);

   bytesRead = 0;
   int ret = 1;
   cout << "Read: ";
   while (ret > 0) {
       ret = recv(sock, pageBuf + bytesRead, contentLength - bytesRead, 0);

       if (ret > 0)
       {
           bytesRead += ret;
       }

       if (contentLength - bytesRead<1024) {
           cout << "\nRealloc memorry" << endl;
           contentLength *= 2;
           pageBuf = (char*)realloc(pageBuf, contentLength); //重新分配内存
       }
       cout << " read: " << ret << " ";
   }
   cout << endl;

   pageBuf[bytesRead] = '\0';
   response = pageBuf;
   closesocket(sock);
   return true;

}

//
/********************************************
* Scans an HTML document for hyperlinks and image URLs.
*
* Every non-empty href="..." value that has not been seen before is recorded
* in visitedUrl, appended to the file url.txt and pushed onto the hrefUrl
* queue. For every "<img " tag, the value of its lazy-src="..." attribute, or
* otherwise of its src="..." attribute, is printed; if it has not been seen
* before it is recorded in visitedImg and appended to imgurls. Attributes are
* only looked for inside the tag itself.
*
* Parameters:
*      htmlResponse - text to scan; it is not modified
*      imgurls      - receives the newly discovered image URLs
*      host         - not used
* Returns: nothing
********************************************/
void HTMLParse(string & htmlResponse, vector<string> & imgurls, const string & host) {
    (void)host;

    // Collect all links and add the new ones to the queue.
    const string hrefTag = "href=\"";
    ofstream ofile("url.txt", ios::app);
    string::size_type hrefAt = htmlResponse.find(hrefTag);
    while (hrefAt != string::npos) {
        const string::size_type valueBegin = hrefAt + hrefTag.size();
        const string::size_type valueEnd = htmlResponse.find('"', valueBegin);
        if (valueEnd == string::npos) {
            // Unterminated value: no complete attribute can follow.
            break;
        }
        if (valueEnd > valueBegin) {   // skip empty href=""
            const string surl = htmlResponse.substr(valueBegin, valueEnd - valueBegin);
            if (visitedUrl.find(surl) == visitedUrl.end()) {
                visitedUrl.insert(surl);
                ofile << surl << endl;
                hrefUrl.push(surl);
            }
        }
        hrefAt = htmlResponse.find(hrefTag, valueEnd);
    }
    ofile << endl << endl;
    ofile.close();

    // Collect the image URLs.
    const string imgTag = "<img ";
    const string srcAttr = "src=\"";
    const string lazyAttr = "lazy-src=\"";
    string::size_type imgAt = htmlResponse.find(imgTag);
    while (imgAt != string::npos) {
        const string::size_type attrsBegin = imgAt + imgTag.size();

        // The tag's attributes end at its closing '>'; a '>' inside a
        // double-quoted attribute value does not close the tag.
        string::size_type tagEnd = attrsBegin;
        bool inQuotes = false;
        while (tagEnd < htmlResponse.size() && (inQuotes || htmlResponse[tagEnd] != '>')) {
            if (htmlResponse[tagEnd] == '"') {
                inQuotes = !inQuotes;
            }
            ++tagEnd;
        }

        // Prefer lazy-src over src. string::npos compares greater than
        // tagEnd, so "not found" fails the range check as well.
        string::size_type valueBegin = string::npos;
        string::size_type attrAt = htmlResponse.find(lazyAttr, attrsBegin);
        if (attrAt < tagEnd) {
            valueBegin = attrAt + lazyAttr.size();
        }
        else {
            attrAt = htmlResponse.find(srcAttr, attrsBegin);
            if (attrAt < tagEnd) {
                valueBegin = attrAt + srcAttr.size();
            }
        }

        if (valueBegin != string::npos) {
            const string::size_type valueEnd = htmlResponse.find('"', valueBegin);
            if (valueEnd != string::npos && valueEnd > valueBegin) {
                const string imgUrl = htmlResponse.substr(valueBegin, valueEnd - valueBegin);
                cout << imgUrl << endl;
                if (visitedImg.find(imgUrl) == visitedImg.end()) {
                    visitedImg.insert(imgUrl);
                    imgurls.push_back(imgUrl);
                }
            }
        }

        imgAt = htmlResponse.find(imgTag, attrsBegin);
    }
    cout << "end of Parse this html" << endl;
}

/*********************************************
* Turns a URL into a string that can be used as a file name.
*
* Each of the characters \ / : * ? " < > | is replaced by '-', every other
* character is kept, and ".txt" is appended to the result.
*
* Parameters:
*      url - URL to convert
* Returns:
*      the converted string
*********************************************/
string UrlToFileName(const string &url) {
    const char * const reserved = "\\/:*?\"<>|";

    string sanitized(url);
    string::size_type at = sanitized.find_first_of(reserved);
    while (at != string::npos) {
        sanitized[at] = '-';
        at = sanitized.find_first_of(reserved, at + 1);
    }

    sanitized += ".txt";
    return sanitized;
}

/*********************************************
*下载图片列表并保存到img文件夹
*参数：
* url - 图片对应的路径，接下来会转换成保存图片的文件夹的名字
* immgurls - 待下载的图片列表
*返回值：
* 无
*********************************************/
void DownLoadImg(vector<string> & imgurls, const string &url) {

   //生成保存该url下图片的文件夹
   string foldname = UrlToFileName(url);
   foldname = "./img/" + foldname;
   if (!CreateDirectory(foldname.c_str(), NULL))
       cout << "Can not create directory:" << foldname << endl;
   char *image;
   int byteRead;
   for (unsigned int i = 0; i<imgurls.size(); i++) {
       //判断是否为图片，bmp，jgp，jpeg，gif
       string str = imgurls[i];
       int pos = str.find_last_of(".");
       if (pos == string::npos)
           continue;
       else {
           string ext = str.substr(pos + 1, str.size() - pos - 1);
           if (ext != "bmp"&& ext != "jpg" && ext != "jpeg"&& ext != "gif"&&ext != "png")
               continue;
       }
       //下载其中的内容
       if (GetHttpResponse(imgurls[i], image, byteRead)) {
           if (strlen(image) == 0) {
               continue;
           }
           const char *p = image;
           const char * pos = strstr(p, "\r\n\r\n") + strlen("\r\n\r\n");
           int index = imgurls[i].find_last_of("/");
           if (index != string::npos) {
               string imgname = imgurls[i].substr(index, imgurls[i].size());
               ofstream ofile(foldname + imgname, ios::binary);
               if (!ofile.is_open())
                   continue;
               cout << g_ImgCnt++ << foldname + imgname << endl;
               ofile.write(pos, byteRead - (pos - p));
               ofile.close();
           }
           free(image);
       }
   }
}

/************************************************
* Processes one page: fetches it, saves the response text below ./html,
* extracts its links and image URLs with HTMLParse and downloads the images
* with DownLoadImg. A page that cannot be fetched is reported and skipped.
*
* Parameters:
*      url - URL of the page to process
* Returns:
*      nothing
*************************************************/
void WebSpider(const string &url) {
    char *rawPage = NULL;
    int rawSize = 0;

    // Fetch the page; give up on this URL when that fails.
    const bool fetched = GetHttpResponse(url, rawPage, rawSize);
    if (!fetched) {
        cout << "The url is wrong! ignoring..." << endl;
        return;
    }

    // Keep a string copy of the response and release the raw buffer.
    string pageText(rawPage);
    free(rawPage);

    // Save the response text; the stream is closed at the end of the scope.
    {
        ofstream saved("./html/" + UrlToFileName(url));
        if (saved.is_open()) {
            saved << pageText << endl;
        }
    }

    // Queue the links found in the page and collect its image URLs.
    vector<string> foundImages;
    HTMLParse(pageText, foundImages, url);

    // Download the collected images.
    DownLoadImg(foundImages, url);
}

void main(void)
{
   char cmd[64];
   sprintf_s(cmd,"color a");
   system(cmd);
   //初始化socket，用于tcp网络连接
   WSADATA wsaData;
   if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
       return;
   }

   // 创建文件夹，保存图片和网页文本文件
   CreateDirectory("./img", 0);
   CreateDirectory("./html", 0);

string kongjie1 = "http://www.xinkongjie.com/d/file/meinv/mnkj/2019-02-26/6552ef0eafbbe01f358e9fd1a6cb8e54.jpg";

   //WebSpider(kongjie1);
   vector<string> imgurls;
   imgurls.push_back(kongjie1);

DownLoadImg(imgurls, "www.xinkongjie.com/d");

string urlStart = "http://www.xinkongjie.com/meinv/";

   // 使用广度遍历
   // 提取网页中的超链接放入hrefUrl中，提取图片链接，下载图片。
   WebSpider(urlStart);

// 访问过的网址保存起来
visitedUrl.insert(urlStart);

   while (hrefUrl.size() != 0) {
   string url = hrefUrl.front(); // 从队列的最开始取出一个网址
   cout << url << endl;
   WebSpider(url);                   // 遍历提取出来的那个网页，找它里面的超链接网页放入hrefUrl，下载它里面的文本，图片
   hrefUrl.pop(); // 遍历完之后，删除这个网址

if (g_ImgCnt > 50) break;
}

   WSACleanup();
   system("pause");
   return;
}