Posting this for safekeeping. The day after tomorrow I start an internship at a search engine company, so I wrote a little crawler. I had never touched one before and it is badly written; treat it as a draft that I will clean up when I have time. It implements the one basic function of fetching some web pages.
Notes:
1. Written on top of Winsock.
2. Pages are crawled depth-first, implemented with recursion; it really should use an explicit stack, otherwise a stack overflow is easy (first sketch below).
3. Pages are parsed by reading from disk one character at a time, which is terribly slow; the file should be read into memory first (second sketch below).
4. The check for a legal URL is not accurate enough, so some non-page resources get fetched while some normal pages are missed.
5. There is no duplicate detection, so the same page may be fetched twice; a hash table would fix this (third sketch below).
6. The HTTP request headers are thrown together casually (fourth sketch below).
There are plenty of other shortcomings, but one thing I did gain is knowing what a crawler program must do at its most basic.
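On note 2, a minimal sketch of how the recursive Crawlweb/AnalyseHtml pair could be driven by an explicit stack of (url, depth) tasks instead, so deep link chains cannot overflow the call stack. The names here (Task, Push, CrawlLoop) are mine and not part of the program below; the fetch-and-parse step is left as a comment.

#include <stdio.h>
#include <string.h>

#define MAXURLLEN 200
#define MAXDEPTH 5
#define STACKCAP 1024

typedef struct { char url[MAXURLLEN]; int depth; } Task;

static Task tasks[STACKCAP];
static int top = 0;

static void Push(const char* url, int depth)
{
    if (top < STACKCAP && depth <= MAXDEPTH)
    {
        strncpy(tasks[top].url, url, MAXURLLEN-1);
        tasks[top].url[MAXURLLEN-1] = 0;
        tasks[top].depth = depth;
        top++;
    }
}

void CrawlLoop(const char* seed)
{
    Push(seed, 0);
    while (top > 0)
    {
        Task t = tasks[--top]; // pop the most recently pushed task, so the order is still depth-first
        printf("crawl url:%s depth:%d\n", t.url, t.depth);
        // fetch t.url here; for every link nurl extracted from the page:
        //     Push(nurl, t.depth + 1);
    }
}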
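On note 3, a sketch of reading the whole file into memory first: the character-by-character fscanf loops in Creatweb and AnalyseHtml could then walk the returned buffer with pointer arithmetic or strstr. SlurpFile is a hypothetical helper, not called anywhere below.

#include <stdio.h>
#include <stdlib.h>

// Read an entire file into one heap buffer. The buffer is NUL-terminated so
// the string functions work on it; the caller frees it. Returns NULL on failure.
char* SlurpFile(const char* path, long* outlen)
{
    FILE* f = fopen(path, "rb");
    if (f == NULL) return NULL;
    fseek(f, 0, SEEK_END);
    long len = ftell(f);
    fseek(f, 0, SEEK_SET);
    char* buf = (char*)malloc(len + 1);
    if (buf != NULL)
    {
        long got = (long)fread(buf, 1, len, f);
        buf[got] = 0;
        if (outlen) *outlen = got;
    }
    fclose(f);
    return buf;
}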
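On note 5, a sketch of duplicate detection with a small chained hash table keyed on the URL string. Seen is a hypothetical helper; Crawlweb could start with if(Seen(url)) return; to skip pages already fetched.

#include <stdlib.h>
#include <string.h>

#define NBUCKET 1024

typedef struct Node { char* url; struct Node* next; } Node;

static Node* bucket[NBUCKET];

static unsigned Hash(const char* s)
{
    unsigned h = 5381; // djb2 string hash
    while (*s) h = h * 33 + (unsigned char)*s++;
    return h % NBUCKET;
}

// Returns 1 if url was seen before; otherwise records it and returns 0.
int Seen(const char* url)
{
    unsigned h = Hash(url);
    Node* p;
    for (p = bucket[h]; p; p = p->next)
        if (strcmp(p->url, url) == 0) return 1;
    p = (Node*)malloc(sizeof(Node));
    p->url = (char*)malloc(strlen(url) + 1);
    strcpy(p->url, url);
    p->next = bucket[h];
    bucket[h] = p;
    return 0;
}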
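On note 6, a sketch of a tidier request builder. HTTP/1.1 requires a Host header, and origin servers normally expect just the path in the request line rather than the full URL. BuildRequest is my name for it; host would be what Getweb extracts and path the rest of the URL (or "/" when there is none). The caller must pass a buffer large enough for the headers plus the path.

#include <stdio.h>

void BuildRequest(char* out, const char* host, const char* path)
{
    sprintf(out,
        "GET %s HTTP/1.1\r\n"
        "Host: %s\r\n"
        "User-Agent: Mozilla/4.0\r\n"
        "Accept: text/html\r\n"
        "Connection: close\r\n"
        "\r\n",
        path, host);
}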
#include <winsock2.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#pragma comment(lib, "ws2_32.lib") // link against Winsock (MSVC)
#define DEFAULT_COUNT 1
#define DEFAULT_PORT 80
#define DEFAULT_BUFFER 2048
#define NAMELEN 100
#define FIRSTURL "http://www.163.com/"
#define FIRSTWEB "www.163.com"
#define MAXDEPTH 5 //recursion depth
#define MAXWEBLEN 50
#define MAXURLLEN 200
#define MAXWEB 30 //number of pages to fetch
char szServer[128]; // Server to connect to
char szMessage[1024]; // Message to send to server
int iPort= DEFAULT_PORT; // Port on server to connect to
int flag;
char szBuffer[DEFAULT_BUFFER];
int count=1;
//skip past the HTTP response headers
void Skiphead(FILE* f)
{
char s[100];
while(1)
{
if(fgets(s,100,f)==NULL) return; // no blank line before EOF
if(s[0]=='\n'||s[0]=='\r') return;
}
}
//build the HTTP request headers
void Requestmessage(char* url)
{
szMessage[0]=0;
strcat(szMessage,"GET ");
strcat(szMessage,url);
strcat(szMessage," HTTP/1.1/r/n");
strcat(szMessage,"Connection: close/r/n");
strcat(szMessage,"User-agent: Mozilla/4.0/r/n");
strcat(szMessage,"Accept: text/html, image/gif, image/jpeg/r/n");
strcat(szMessage,"Accept-language:zh-cn/r/n");
strcat(szMessage,"/r/n");
}
//save the fetched page as a new file named after its <title>
int Creatweb(char* name)
{
FILE* tf=fopen("temp.html","r");
//char name[NAMELEN];
char temp[10];
char c;
char no[10];
if(fscanf(tf,"%c",&c)==EOF)
{
printf("temp.html 内容为空,抓取失败/n");
return 0;
}
while(1)
{
if(fscanf(tf,"%c",&c)==EOF)
{
printf("网页内容不含标题/n");
return 0;
}
if(c=='t'||c=='T')
{
if(fscanf(tf,"%c%c%c%c%c",temp,temp+1,temp+2,temp+3,temp+4)==EOF)
{
printf("网页内容不含标题/n");
return 0;
}
temp[5]=0;
if((strcmp(temp,"itle>")==0)||(strcmp(temp,"ITLE>")==0))
{
int i=0;
fscanf(tf,"%c",&c);
while(c!='<'&&i<NAMELEN-12) // leave room for the _N.html suffix
{
name[i++]=c;
if(fscanf(tf,"%c",&c)==EOF) break;
}
name[i]=0;
break;
}
}
}
memset(no,0,sizeof(no));
sprintf(no,"_%d",count++);
strcat(name,no);
strcat(name,".html");
fseek(tf,0,SEEK_SET);
Skiphead(tf);
FILE* nf=fopen(name,"w");
while(fscanf(tf,"%c",&c)!=EOF)
{
fprintf(nf,"%c",c);
fflush(nf);
}
fclose(tf);
fclose(nf);
printf("抓取页面: %s/n",name);
if(count-1==MAXWEB)
{
printf("共抓取 %d 个页面/n",count-1);
WSACleanup();
exit(0);
}
return 1;
}
//parse the host name out of a URL
int Getweb(char* url,char* web)
{
char* s=url;
char* t=web;
if(*s=='h'&&*(s+1)=='t'&&*(s+2)=='t'&&*(s+3)=='p')s+=7;
else {
printf("非合法的url:%s/n",url);
return 0;
}
while(*s&&*s!='/') *web++=*s++;
*web=0;
printf("sever web:%s/n",t);
return 1;
}
void Crawlweb(char* url,int depth);
void AnalyseHtml(char* name,int depth)
{
if(depth+1>MAXDEPTH) return;
FILE* file=fopen(name,"r");
char nurl[MAXURLLEN];
char hr[10];
int i=0;
char c;
while(fscanf(file,"%c",&c)!=EOF)
{
if(c=='h')
{
if(fscanf(file,"%c%c%c%c%c",hr,hr+1,hr+2,hr+3,hr+4)==EOF) break;
hr[5]=0;
if(strcmp(hr,"ref=/"")==0)
{
i=0;
fscanf(file,"%c",&c);
while(c!='/"')
{
nurl[i++]=c;
fscanf(file,"%c",&c);
}
nurl[i]=0;
// printf("nurl:%s/n",nurl);
Crawlweb(nurl,depth+1);
}
}
}
fclose(file);
}
// url: the URL to fetch  depth: current recursion depth
void Crawlweb(char* url,int depth)
{
if(depth>MAXDEPTH) return;
printf("crawl url:%s/n",url);
SOCKET sClient;
int ret;
struct sockaddr_in server;
struct hostent *host = NULL;
FILE* webfile;
webfile=fopen("temp.html","w");
char name[NAMELEN];
char web[MAXWEBLEN];
if(!Getweb(url,web)) return ;
Requestmessage(url);
sClient = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
if (sClient == INVALID_SOCKET)
{
printf("socket() failed: %d/n", WSAGetLastError());
exit(1);
}
server.sin_family = AF_INET;
server.sin_port = htons(iPort);
host = gethostbyname(web);
if (host == NULL)
{
printf("Unable to resolve server: %s\n", web); // report the host we tried, not the unused szServer
fclose(webfile);
closesocket(sClient);
return ;
// exit(1);
}
CopyMemory(&server.sin_addr, host->h_addr_list[0],
host->h_length);
if (connect(sClient, (struct sockaddr *)&server,
sizeof(server)) == SOCKET_ERROR)
{
printf("connect() failed: %d\n", WSAGetLastError());
fclose(webfile);
closesocket(sClient);
return ;
// exit(1);
}
//send the HTTP request
ret = send(sClient, szMessage, strlen(szMessage), 0);
if (ret == 0)
{
printf("send 0 byte failed: %d/n");
exit(1);
}
else if (ret == SOCKET_ERROR)
{
printf("send() failed: %d/n", WSAGetLastError());
exit(1);
}
printf("Send %d bytes/n", ret);
//receive the page into temp.html
int cnt=0;
while(ret!=0)
{
ret = recv(sClient, szBuffer, DEFAULT_BUFFER, 0);
if (ret == 0)
break;
else if (ret == SOCKET_ERROR)
{
printf("recv() failed: %d\n", WSAGetLastError());
break;
}
cnt+=ret; // count only bytes actually received
fwrite(szBuffer,1,ret,webfile); // raw write, so embedded NUL bytes cannot truncate the page
fflush(webfile);
}
fclose(webfile); // close before Creatweb reopens temp.html for reading
closesocket(sClient);
if(cnt&&Creatweb(name)) AnalyseHtml(name,depth);
}
int main(int argc, char **argv)
{
WSADATA wsd;
count=1;
if (WSAStartup(MAKEWORD(2,2), &wsd) != 0)
{
printf("Failed to load Winsock library!/n");
return 1;
}
Crawlweb(FIRSTURL,0);
WSACleanup();
return 0;
}