// WebWormpro.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <cstdio>
#include <iostream>
#include <fstream>
#include <string>
#include <cstring>
#include <regex>//正则表达式
#include <vector>
#include <queue>
#include <algorithm>
#include <map>
#include <WinSock2.h>//网络套接字头文件
#pragma comment(lib,"ws2_32.lib") //网络通信需要
using namespace std;
char host[500]; // Host name of the URL being processed; written by analyUrl(), read by preConnect()
int num = 1; // NOTE(review): not referenced anywhere in the visible code
char othPath[800]; // Part of the URL that follows the host name; written by analyUrl(), read by preConnect()
string allHtml; // Response text received by PutImagetoSet() and scanned by the regex helpers
vector <string> photoUrl; // Image URLs collected by regexGetimage()
vector <string> comUrl; // http:// links collected by regexGetcom()
map <string, int> mp; // Per-URL counter that bfs() uses to recognise URLs it has already seen
SOCKET sock; // Socket created by preConnect() and read by PutImagetoSet()/OutImage()
// Splits an http:// URL into the global `host` and `othPath` buffers.
//
// url: NUL-terminated URL text. The first occurrence of "http://" marks the
//      start of the host name; everything up to the next '/' is the host and
//      the rest (up to the first whitespace character) is the path.
//
// Returns true when both parts were stored. Returns false, leaving `host` and
// `othPath` untouched, when "http://" is missing, the host name is empty, or
// either part would not fit in its buffer.
bool analyUrl(char *url)
{
    const char *scheme = strstr(url, "http://");
    if (scheme == NULL)
        return false;

    const char *hostBegin = scheme + 7; // skip "http://"
    const size_t hostLen = strcspn(hostBegin, "/");
    if (hostLen == 0 || hostLen >= sizeof(host))
        return false;

    // The path stops at the first whitespace character, matching what the
    // previous "%s" conversion accepted.
    const char *pathBegin = hostBegin + hostLen;
    const size_t pathLen = strcspn(pathBegin, " \t\n\v\f\r");
    if (pathLen >= sizeof(othPath))
        return false;

    memcpy(host, hostBegin, hostLen);
    host[hostLen] = '\0';

    if (pathLen == 0)
    {
        // No path in the URL: request the site root instead of silently
        // reusing the path left over from the previously parsed URL.
        othPath[0] = '/';
        othPath[1] = '\0';
    }
    else
    {
        memcpy(othPath, pathBegin, pathLen);
        othPath[pathLen] = '\0';
    }

    cout << "host: " << host << " repath:" << othPath << endl;
    return true;
}
// Collects the URL of every double-quoted src="....jpg" attribute found in
// `allHtml` and appends each one to the global `photoUrl` list.
//
// allHtml: text to scan; it is only read.
void regexGetimage(string &allHtml)
{
    // "\\." reaches the regex engine as an escaped (literal) dot. [^\"] keeps
    // a match inside one quoted attribute value instead of letting it run on
    // to a later ".jpg" on the same line.
    static const regex pattern("src=\"([^\"]*?\\.jpg)\"");

    string::const_iterator cursor = allHtml.begin();
    const string::const_iterator last = allHtml.end();
    smatch mat;
    while (regex_search(cursor, last, mat, pattern))
    {
        photoUrl.push_back(mat[1].str());
        cursor = mat[0].second; // resume right after the whole match
    }
}
// Collects every absolute http:// link found in a double-quoted href
// attribute of `allHtml` and appends each one to the global `comUrl` list.
//
// allHtml: text to scan; it is only read.
void regexGetcom(string &allHtml)
{
    // "\\s" reaches the regex engine as the whitespace class. Written with a
    // single backslash the compiler reduced it to the letter 's', which made
    // the pattern reject every URL containing an 's'.
    static const regex pattern("href=\"(http://[^\\s'\"]+)\"");

    string::const_iterator cursor = allHtml.begin();
    const string::const_iterator last = allHtml.end();
    smatch mat;
    while (regex_search(cursor, last, mat, pattern))
    {
        comUrl.push_back(mat[1].str());
        cursor = mat[0].second; // resume right after the whole match
    }
}
// Prints `prefix` followed by the last Winsock error code, then releases the
// global socket (if any) and marks it invalid so later recv() calls fail
// immediately instead of using a half-initialised handle.
static void failConnection(const char *prefix)
{
    // Read the error code before closesocket() has a chance to overwrite it.
    const int code = WSAGetLastError();
    cout << prefix << code << endl;
    if (sock != INVALID_SOCKET)
        closesocket(sock);
    sock = INVALID_SOCKET;
}

// Opens a TCP connection to port 80 of the global `host` and sends an
// HTTP/1.1 GET request for the global `othPath`.
//
// On success the global `sock` is connected and the response can be read
// from it. On any failure a message is printed and `sock` is left as
// INVALID_SOCKET.
void preConnect()
{
    // Winsock only has to be started once for the whole process.
    static bool winsockStarted = false;
    if (!winsockStarted)
    {
        WSADATA wd;
        const int startupErr = WSAStartup(MAKEWORD(2, 2), &wd);
        if (startupErr != 0)
        {
            cout << "WSAStartup失败,错误码:" << startupErr << endl;
            sock = INVALID_SOCKET;
            return;
        }
        winsockStarted = true;
    }

    sock = socket(AF_INET, SOCK_STREAM, 0);
    if (sock == INVALID_SOCKET)
    {
        failConnection("建立socket失败,错误码:");
        return;
    }

    // Zero address and port: binds to any local interface on a port chosen by the system.
    sockaddr_in sa = { AF_INET };
    if (bind(sock, (sockaddr*)&sa, sizeof(sa)) == SOCKET_ERROR)
    {
        failConnection("bind函数失败,错误码:");
        return;
    }

    struct hostent *p = gethostbyname(host); // resolve the host name
    if (p == NULL)
    {
        failConnection("主机无法解析出ip!!错误");
        return;
    }

    sa.sin_port = htons(80); // HTTP port in network byte order
    memcpy(&sa.sin_addr, p->h_addr, sizeof(sa.sin_addr));
    if (connect(sock, (sockaddr*)&sa, sizeof(sa)) == SOCKET_ERROR)
    {
        failConnection("connect函数失败!错误码:");
        return;
    }

    // An empty path would produce the malformed request line "GET  HTTP/1.1".
    const string path = (othPath[0] != '\0') ? string(othPath) : string("/");
    const string reqInfo = "GET " + path + " HTTP/1.1\r\nHost: " + string(host) + "\r\nConnection:Close\r\n\r\n";
    if (SOCKET_ERROR == send(sock, reqInfo.c_str(), static_cast<int>(reqInfo.size()), 0))
    {
        failConnection("send error!!错误码:");
        return;
    }
}
void OutImage(string imageurl)//将图片命名,保存在目录
{
int n;
char tmep[800];
strcpy(tmep, imageurl.c_str());
analyUrl(tmep);//仅仅支持http协议,解析出主机和IP地址
preConnect();//socket进行网络连接
string photoname;
photoname.resize(imageurl.size());//设置大小
int k = 0;
for (int i = 0; i < imageurl.length(); i++)
{
char ch = imageurl[i];
if (ch != '\\'&& ch != '/'&&ch != ':'&&ch != '*'&&ch != '?'&&ch != '"'&& ch != '<'&&ch != '>'&&ch != '|')
photoname[k++] = ch;
}
photoname = "./imgData/" + photoname.substr(0, k) + ".jpg";
fstream file;
file.open(photoname, ios::out | ios::binary);
char buf[1024];
memset(buf, 0, sizeof(buf));
//与正确的区别差了下面这句
n = recv(sock, buf, sizeof(buf) - 1, 0);
char *cpos = strstr(buf, "\r\n\r\n");
file.write(cpos + strlen("\r\n\r\n"), n - (cpos - buf) - strlen("\r\n\r\n"));
while ((n = recv(sock, buf, sizeof(buf) - 1, 0)) > 0)
{
file.write(buf, n);
}
file.close();
}
void PutImagetoSet()//解析整个html代码
{
int n;
char buf[1024];
while ((n = recv(sock, buf, sizeof(buf) - 1, 0)) > 0)
{
buf[n] = '\0';
allHtml += string(buf);
}
regexGetimage(allHtml);//C++正则表达式提取图片url
regexGetcom(allHtml);//提取网页中的http://的url
}
// Breadth-first crawl starting at `beginUrl`.
//
// For each page taken from the queue: fetch it, download every image it
// references, then enqueue every http:// link that has not been seen before.
// The global map `mp` records the URLs already seen.
void bfs(string beginUrl)
{
    queue<string> pending;
    // A URL is marked as seen when it is enqueued, not when it is dequeued;
    // otherwise the same link found on several pages is queued and crawled
    // once per occurrence.
    mp[beginUrl]++;
    pending.push(beginUrl);

    while (!pending.empty())
    {
        const string cur = pending.front();
        pending.pop();

        char tmp[800];
        if (cur.size() >= sizeof(tmp))
            continue; // does not fit the scratch buffer
        strcpy(tmp, cur.c_str());
        if (!analyUrl(tmp))
            continue; // not an http:// URL: connecting would reuse the previous host

        preConnect();    // connect and send the GET request for this page
        PutImagetoSet(); // fills photoUrl and comUrl

        for (vector<string>::const_iterator img = photoUrl.begin(); img != photoUrl.end(); ++img)
        {
            OutImage(*img);
        }
        photoUrl.clear();

        for (vector<string>::const_iterator link = comUrl.begin(); link != comUrl.end(); ++link)
        {
            // operator[] yields 0 for a URL that has never been seen.
            if (mp[*link]++ == 0)
                pending.push(*link);
        }
        comUrl.clear();
    }
}
// Program entry point: prints the banner, reads the start URL from standard
// input, creates the ./imgData output directory and starts the crawl.
int _tmain(int argc, _TCHAR * argv[])
{
    const char *bannerLines[] = {
        "\n\n\n\n\t\t\t******************************************",
        "\t\t\t** 爬虫 **",
        "\t\t\t******************************************"
    };
    const size_t bannerCount = sizeof(bannerLines) / sizeof(bannerLines[0]);
    for (size_t i = 0; i < bannerCount; ++i)
    {
        cout << bannerLines[i] << endl;
    }

    cout << "\t\t\t输入网址:";
    string startUrl;
    cin >> startUrl;

    // Downloaded images are written into this directory.
    CreateDirectoryA("./imgData", NULL);

    // A URL known to work for testing: http://www.27270.com/
    bfs(startUrl);
    return 0;
}