最近跟朋友一起写了一个 批量网站查询工具 BlueCatTools,其中,需要用C++解析HTTP下载下来的HTML文档。
懂的人不用我多说,不懂的我也没能力说到你懂,直接看代码吧。
BlueCatTools 百度收录批量查询工具
//--caller.cpp--
// To run the program, make sure there is a file named "NIKE新浪竞技风暴_新浪网.htm" in your working directory.
// The program's run time can be cut roughly in half by a more efficient implementation of the "ofile <<" statement.
#include "HtmlParser.h"
#include <ctime>
#include <iomanip>
using namespace std;
void main()
{
clock_t start = clock();
map<string, link_info> LinkInfo;
multimap<float, link_info, greater<float> > Sorted;
string FileName = "NIKE新浪竞技风暴_新浪网.htm";
HtmlParser(FileName, LinkInfo);
string Result;
for(map<string, link_info>::iterator miter = LinkInfo.begin(); miter != LinkInfo.end(); miter++)
{
Sorted.insert(make_pair(miter->second.Value, miter->second));
}
ofstream ofile;
ofile.open("a.txt");
for(multimap<float, link_info, greater<float> >::iterator miter = Sorted.begin(); miter != Sorted.end(); miter++)
{
ofile << miter->first << "\t"
<<setw(50) << left << miter->second.Title << "\t"
<< miter->second.Link << endl;
}
ofile.close();
cout << clock() - start << endl;
}
//--HtmlParser.h--
#pragma once
#include <cstdio>
#include <iostream>
#include <fstream>
#include <string>
#include <map>
using namespace std;
// One hyperlink collected by HtmlParser from an HTML document.
struct link_info
{
// Score used for ordering the output. HtmlParser computes it as
// counter * 0.0005 with a counter that starts at 2000 and goes down by one
// per handled anchor, so links found earlier get larger values.
float Value;
// Text taken from the anchor's href attribute (original letter case).
string Link;
// Text between the anchor's tags after it has been passed through RepairTitle.
string Title;
};
// Number of bytes HtmlParser asks fread() for when it reads one chunk of the file.
const int BUFFERSIZE = 10000;
// Number of bytes HtmlParser moves the file position back after reading a
// chunk, so that the next chunk starts LOOKUP bytes before the previous one ended.
const int LOOKUP = 100;
// Exclusive upper bound on the byte length of one "<a ... </a>" span;
// HtmlParser skips spans whose length is ASIZE or more.
const int ASIZE = 300;
/// Rewrites a link title so that it contains no whitespace.
///
/// Every run of whitespace bytes (space, tab, CR, LF) is replaced by a single
/// '_'; all other bytes are copied unchanged. An underscore is never appended
/// directly after another underscore, so whitespace that follows a literal
/// '_' in the input is dropped.
///
/// @param Title  Raw title text; it is not modified.
/// @return The rewritten title. Leading whitespace yields one leading '_',
///         and an empty input yields an empty string.
inline std::string RepairTitle(const std::string& Title)
{
    std::string Result;
    Result.reserve(Title.size());

    for (std::string::const_iterator it = Title.begin(); it != Title.end(); ++it)
    {
        const char ch = *it;
        const bool IsBlank = (ch == '\r' || ch == '\n' || ch == ' ' || ch == '\t');
        if (!IsBlank)
        {
            Result.push_back(ch);
        }
        // Check for emptiness first: looking at the last character of an
        // empty string is undefined behaviour.
        else if (Result.empty() || Result[Result.size() - 1] != '_')
        {
            Result.push_back('_');
        }
    }
    return Result;
}
bool HtmlParser(const string& FileName, map<string, link_info>& LinkInfo)
{
int i = 2000;
FILE *fp;
size_t ReadIn;
char Dst[ASIZE];
char buffer[BUFFERSIZE + 1];
string Modified_Line;
fp = fopen(FileName.c_str(), "rb");
while(fp)
{
ReadIn = fread(buffer, 1, BUFFERSIZE, fp);
fseek(fp, - LOOKUP, SEEK_CUR);
if(ReadIn == LOOKUP) break;
buffer[ReadIn] = 0;
Modified_Line.clear();
char *p = buffer ;
while(*p)
{
unsigned ch = *p;
if(ch >= 'A' && ch <= 'Z') Modified_Line.push_back(ch + 32);
else Modified_Line.push_back(ch);
p++;
}
string::size_type pos0;
string::size_type pos1 = 0;
while((pos0 = Modified_Line.find("<a", pos1)) != string::npos)
{
string Atag, LAtag;
pos1 = Modified_Line.find("</a", pos0);
if(pos1 != string::npos){
if(pos1 - pos0 + 4 >= ASIZE) //make sure that Atag.size() < Asize
continue;
memset(Dst, 0, ASIZE);
Atag = strncpy(Dst, buffer + pos0, pos1 - pos0 + 4);
LAtag = Modified_Line.substr(pos0, pos1 - pos0 + 4);
link_info tmpLink;
{
string::size_type pos0, pos1;
pos1 = LAtag.find("</a");
while(LAtag[pos1 - 1] == '>')
{
pos1 = LAtag.find_last_of("<", pos1 - 1);
if(pos1 == 0) break;
}
pos0 = LAtag.find_last_of(">", pos1);
string tmpstr = Atag.substr(pos0 + 1, pos1 - pos0 - 1);
tmpLink.Title = RepairTitle(tmpstr);;
}
{
string::size_type pos0, pos1;
pos0 = LAtag.find("href",0);
pos0 = LAtag.find_first_not_of("=\"\' ",pos0 + 4); // ",', ,=
pos1 = LAtag.find_first_of("\"\' >", pos0 + 1); // ",', ,>
tmpLink.Link = Atag.substr(pos0, pos1 - pos0);
}
tmpLink.Value = (i--) * 0.0005;
if(tmpLink.Title.size() > 3 && tmpLink.Link.size() > 3) //filter: the filename.size() at least 3
LinkInfo.insert(make_pair(tmpLink.Link, tmpLink)); //filter: the Link must be unique
}
}
}
return true;
}