我们平时或多或少都需要采集一些网络上的信息,当然采集的方法会有很多种。为了更高效地采集数据,我们基本上都要使用多线程;采集下来内容之后,最关键的还是分析网页内容。我们可以使用正则来分析网页中的内容,今天我们使用 HtmlAgilityPack 类库来分析采集到的网页内容。
使用的工具类库包括:HtmlAgilityPack,以及苏飞的一个 HttpHelper 类,开发环境用的 VisualStudio 2008,.NetFramework 2.0,最终结果如图所示:
同时也看到几个最主要的类,这儿采用工厂模式,目的是让扩展更加容易一些,CollectorFactoryManager.cs 代码如下:
using System;
using System.Collections.Generic;
namespace CollectDemo
{
/// <summary>
/// Drives a queue of collector factories with bounded concurrency.
/// Each factory reports completion through <see cref="CollectorFactoryCalback"/>,
/// which starts the next pending factory; when the whole list has been
/// processed the manager fires its own completion callback exactly once.
/// </summary>
public class CollectorFactoryManager
{
    // Maximum number of factories started up-front (thread cap).
    private const int initCount = 5;
    // Dedicated lock target. Locking `this` is unsafe: external code that
    // also locks on the instance could deadlock with us.
    private readonly object syncRoot = new object();
    private IList<CollectorFactory> factoryList;
    private Action callback;
    private int collectFactoryIndex;

    /// <summary>
    /// Creates the manager and registers the factories to run.
    /// </summary>
    /// <param name="callback">Invoked once after every factory has finished.</param>
    public CollectorFactoryManager(Action callback)
    {
        this.callback = callback;
        this.factoryList = new List<CollectorFactory>();
        // Additional factories can be appended here without further changes.
        this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/", this.CollectorFactoryCalback));
        this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/sitehome/p/2", this.CollectorFactoryCalback));
    }

    /// <summary>
    /// Starts collecting. At most <see cref="initCount"/> factories run
    /// concurrently; the rest start as earlier ones complete.
    /// </summary>
    public void Run()
    {
        this.collectFactoryIndex = -1;
        // Threads have an upper bound, so only kick off a limited batch.
        for (int index = 0; index < initCount && index < this.factoryList.Count; index++)
        {
            this.CollectorFactoryData();
        }
    }

    // Advances to the next pending factory, or signals completion once the
    // whole list has been processed.
    private void CollectorFactoryData()
    {
        lock (this.syncRoot)
        {
            this.collectFactoryIndex++;
            // Not finished yet: start the next factory in order.
            if (this.collectFactoryIndex < this.factoryList.Count)
            {
                CollectorFactory collectorFactory = this.factoryList[this.collectFactoryIndex];
                collectorFactory.Run();
            }
            else if (this.collectFactoryIndex == this.factoryList.Count)
            {
                // Fire End exactly once: only the first worker stepping past
                // the end triggers it. (The original plain `else` made every
                // finishing worker call End, invoking the callback repeatedly.)
                this.End();
            }
        }
    }

    // Completion callback handed to each factory.
    public void CollectorFactoryCalback()
    {
        this.CollectorFactoryData();
    }

    /// <summary>
    /// Signals that every factory has finished collecting.
    /// </summary>
    public void End()
    {
        if (this.callback != null) this.callback();
    }
}
}
CollectorFactory.cs 代码如下:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;
namespace CollectDemo
{
/// <summary>
/// Base class for a page-level collector. Subclasses override
/// <see cref="CreateAndGetHtmlContent"/> and <see cref="AnalysisHtmlContent"/>
/// to fetch and parse one listing page; the base class then runs the parsed
/// <see cref="CollectorItem"/>s with bounded concurrency and reports
/// completion through <see cref="End"/> exactly once.
/// </summary>
public class CollectorFactory
{
    // Maximum number of item collectors started up-front per page.
    private const int initCount = 10;
    // Shared RNG. On .NET 2.0 `new Random()` seeds from the clock, so
    // creating one per call in quick succession yields identical delays.
    private static readonly Random random = new Random();
    // Dedicated lock target; locking `this` risks deadlock with callers.
    private readonly object syncRoot = new object();
    protected string htmlText;
    protected string urlPath;
    protected IList<CollectorItem> collectorItemList;
    protected Action callback;
    protected int collectItemIndex;

    /// <summary>
    /// Creates a collector for one page.
    /// </summary>
    /// <param name="urlPath">URL of the page to collect.</param>
    /// <param name="callback">Invoked once when all items on the page are done.</param>
    public CollectorFactory(string urlPath, Action callback)
    {
        this.urlPath = urlPath;
        this.callback = callback;
    }

    /// <summary>
    /// Starts collecting on a background thread. Returns immediately; the
    /// anti-crawler delay happens on the worker thread (the original slept
    /// on the calling thread, blocking the manager for 1-3 seconds per page).
    /// </summary>
    public virtual void Run()
    {
        Thread thread = new Thread(new ThreadStart(this.Start));
        thread.Start();
    }

    /// <summary>
    /// Worker-thread entry point: fetch, parse, then fan out item collectors.
    /// </summary>
    protected virtual void Start()
    {
        // Random delay so request bursts look less like a crawler.
        int sleepData;
        lock (random)
        {
            sleepData = random.Next(1000, 3000);
        }
        Thread.Sleep(sleepData);
        this.CreateAndGetHtmlContent();
        this.AnalysisHtmlContent();
        this.CollectorPageData();
    }

    /// <summary>
    /// Builds the HTTP request and downloads the page into htmlText.
    /// Overridden by subclasses; base does nothing.
    /// </summary>
    protected virtual void CreateAndGetHtmlContent()
    {
    }

    /// <summary>
    /// Parses htmlText and populates collectorItemList.
    /// Overridden by subclasses; base does nothing.
    /// </summary>
    protected virtual void AnalysisHtmlContent()
    {
    }

    // Starts up to initCount item collectors; the rest start as earlier
    // ones complete via CollectorItemCalback.
    protected virtual void CollectorPageData()
    {
        this.collectItemIndex = -1;
        if (this.collectorItemList != null && this.collectorItemList.Count > 0)
        {
            for (int index = 0; index < initCount && index < this.collectorItemList.Count; index++)
            {
                this.CollectorItemData();
            }
        }
        else
        {
            // No items parsed (empty page or failed download): report
            // completion immediately. The original fell through silently,
            // leaving the manager waiting forever on this factory.
            this.End();
        }
    }

    // Advances to the next pending item, or signals completion once the
    // whole list has been processed.
    public virtual void CollectorItemData()
    {
        lock (this.syncRoot)
        {
            this.collectItemIndex++;
            if (this.collectItemIndex < this.collectorItemList.Count)
            {
                CollectorItem collectorItem = this.collectorItemList[this.collectItemIndex];
                collectorItem.Run();
            }
            else if (this.collectItemIndex == this.collectorItemList.Count)
            {
                // Fire End exactly once: only the first worker stepping past
                // the end triggers it. (The original plain `else` called End
                // from every finishing worker, so the manager's callback ran
                // once per concurrent item still in flight.)
                this.End();
            }
        }
    }

    // Completion callback handed to each item collector.
    public void CollectorItemCalback()
    {
        this.CollectorItemData();
    }

    /// <summary>
    /// Signals that every item on this page has finished collecting.
    /// </summary>
    public virtual void End()
    {
        if (this.callback != null) this.callback();
    }
}
}
CollectorItem.cs 代码如下:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;
namespace CollectDemo
{
/// <summary>
/// Base class for a single detail-page collector. Subclasses override
/// <see cref="CreateAndGetHtmlContent"/> and <see cref="AnalysisHtmlContent"/>
/// to fetch and process one article; the callback reports completion back
/// to the owning <see cref="CollectorFactory"/>.
/// </summary>
public class CollectorItem
{
    // Shared RNG. On .NET 2.0 `new Random()` seeds from the clock, so
    // creating one per call in quick succession yields identical delays.
    private static readonly Random random = new Random();
    protected string htmlText;
    protected CollectorFactory collectorFactory;
    protected string urlPath;
    protected Action callback;

    /// <summary>
    /// Creates a collector for one detail page.
    /// </summary>
    /// <param name="collectorFactory">Owning page-level factory.</param>
    /// <param name="urlPath">URL of the article to collect.</param>
    /// <param name="callback">Invoked when this item has finished.</param>
    public CollectorItem(CollectorFactory collectorFactory, string urlPath, Action callback)
    {
        this.collectorFactory = collectorFactory;
        this.urlPath = urlPath;
        this.callback = callback;
    }

    /// <summary>
    /// Starts collecting on a background thread. Returns immediately; the
    /// anti-crawler delay happens on the worker thread (the original slept
    /// on the calling thread, blocking the factory for 2-6 seconds per item).
    /// </summary>
    public void Run()
    {
        Thread thread = new Thread(new ThreadStart(this.Start));
        thread.Start();
    }

    /// <summary>
    /// Worker-thread entry point: delay, fetch, then parse.
    /// </summary>
    protected virtual void Start()
    {
        // Random delay so request bursts look less like a crawler.
        int sleepData;
        lock (random)
        {
            sleepData = random.Next(2000, 6000);
        }
        Thread.Sleep(sleepData);
        this.CreateAndGetHtmlContent();
        this.AnalysisHtmlContent();
    }

    /// <summary>
    /// Builds the HTTP request and downloads the page into htmlText.
    /// Overridden by subclasses; base does nothing.
    /// </summary>
    protected virtual void CreateAndGetHtmlContent()
    {
    }

    /// <summary>
    /// Processes htmlText. Overridden by subclasses; base does nothing.
    /// </summary>
    protected virtual void AnalysisHtmlContent()
    {
    }

    /// <summary>
    /// Signals that this item has finished collecting.
    /// </summary>
    public virtual void End()
    {
        if (this.callback != null) this.callback();
    }
}
}
本例子采集的是博客园的前两页数据,所以我们需要一个解析两页数据链接的 CollectorFactoryOne.cs 类,代码如下:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;
namespace CollectDemo
{
/// <summary>
/// Collects one cnblogs.com listing page: downloads the HTML and extracts
/// every article link (anchors with class "titlelnk") into CollectorItemOne
/// instances for the base class to run.
/// </summary>
public class CollectorFactoryOne : CollectorFactory
{
    public CollectorFactoryOne(string urlPath, Action callback) : base(urlPath, callback)
    {
    }

    /// <summary>
    /// Downloads the listing page with a browser-like User-Agent.
    /// </summary>
    protected override void CreateAndGetHtmlContent()
    {
        HttpItem httpItem = new HttpItem();
        httpItem.URL = this.urlPath;
        httpItem.Method = "get";
        // Pretend to be a desktop browser so the request is less likely
        // to be rejected as a crawler.
        httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
        httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
        this.htmlText = httpResult.Html;
    }

    /// <summary>
    /// Extracts article links and builds one CollectorItemOne per link.
    /// </summary>
    protected override void AnalysisHtmlContent()
    {
        HtmlDocument htmlDocument = new HtmlDocument();
        // htmlText can be null when the download failed; LoadHtml(null) throws.
        htmlDocument.LoadHtml(this.htmlText ?? string.Empty);
        this.collectorItemList = new List<CollectorItem>();
        HtmlNodeCollection hrefList = htmlDocument.DocumentNode.SelectNodes("//a[@class = 'titlelnk']");
        if (hrefList != null)
        {
            foreach (HtmlNode hrefNode in hrefList)
            {
                HtmlAttribute htmlAttribute = hrefNode.Attributes["href"];
                // Anchors without an href would NRE on .Value below; skip them.
                if (htmlAttribute == null || string.IsNullOrEmpty(htmlAttribute.Value))
                {
                    continue;
                }
                this.collectorItemList.Add(new CollectorItemOne(this, htmlAttribute.Value, this.CollectorItemCalback));
            }
        }
    }
}
}
还有一个解析博客园每页内容的 CollectorItemOne.cs 类,代码如下:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;
using System.IO;
namespace CollectDemo
{
/// <summary>
/// Collects one cnblogs.com article page: downloads the HTML and saves it
/// to a GUID-named .txt file under the application's "txt" folder.
/// </summary>
public class CollectorItemOne : CollectorItem
{
    // Dedicated lock target for file writes; locking `this` risks deadlock
    // with external code that also locks on the instance.
    private readonly object writeLock = new object();

    public CollectorItemOne(CollectorFactory collectorFactory, string urlPath, Action callback)
        : base(collectorFactory, urlPath, callback)
    {
    }

    /// <summary>
    /// Downloads the article page with a browser-like User-Agent.
    /// </summary>
    protected override void CreateAndGetHtmlContent()
    {
        HttpItem httpItem = new HttpItem();
        httpItem.URL = this.urlPath;
        httpItem.Method = "get";
        // Pretend to be a desktop browser so the request is less likely
        // to be rejected as a crawler.
        httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
        httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
        this.htmlText = httpResult.Html;
    }

    /// <summary>
    /// Writes the downloaded HTML to a new GUID-named file. Always invokes
    /// <see cref="CollectorItem.End"/>, even on failure — otherwise the owning
    /// factory would wait for this item forever.
    /// </summary>
    protected override void AnalysisHtmlContent()
    {
        HtmlDocument htmlDocument = new HtmlDocument();
        // htmlText can be null when the download failed; LoadHtml(null) throws.
        htmlDocument.LoadHtml(this.htmlText ?? string.Empty);
        lock (this.writeLock)
        {
            try
            {
                string dirPath = Path.Combine(System.Windows.Forms.Application.StartupPath, "txt");
                // The output folder is not created by StreamWriter; without
                // this the very first write throws DirectoryNotFoundException.
                Directory.CreateDirectory(dirPath);
                // GUID names are unique, so no pre-existence check is needed,
                // and StreamWriter creates the file itself (the original's
                // File.Create(...).Close() round-trip was redundant).
                string filePath = Path.Combine(dirPath, System.Guid.NewGuid() + ".txt");
                using (StreamWriter streamWriter = new StreamWriter(filePath, true, System.Text.Encoding.UTF8))
                {
                    streamWriter.Write(htmlDocument.DocumentNode.InnerHtml);
                }
            }
            catch (Exception)
            {
                // Best-effort save: a failed write must not crash the worker
                // thread (an unhandled exception here takes down the process).
            }
            finally
            {
                // Must run on every path. The original's early `return` on
                // File.Exists skipped End() and stalled the whole pipeline.
                this.End();
            }
        }
    }
}
}
主要的多线程操作都已经封装好,只需要处理采集以及解析网页内容就可以实现快速扩展了。
本文介绍了一个使用HtmlAgilityPack和多线程技术的博客园爬虫设计案例,通过工厂模式实现了灵活扩展,采集并解析了博客内容。

6064





