新建工程,项目类型选择"Windows 服务"(示例程序的开发平台是VS2008)。
工程名称是SpiderServer,将新建工程中的Service1类的类名改成SpiderServer。
在工程中添加安装类Installer1,代码如下:
namespace SpiderServer
{
    /// <summary>
    /// Installer component for the "SpiderServer" Windows service. It is marked with
    /// <c>RunInstaller(true)</c> and registers a process installer plus a service installer.
    /// </summary>
    [RunInstaller(true)]
    public partial class Installer1 : Installer
    {
        private System.ServiceProcess.ServiceProcessInstaller spInstaller;
        private System.ServiceProcess.ServiceInstaller sInstaller;

        /// <summary>
        /// Builds both child installers and adds them to <c>Installers</c>.
        /// </summary>
        public Installer1()
        {
            InitializeComponent();

            // Account settings for the service process: run as LocalSystem,
            // so no explicit user name or password is supplied.
            this.spInstaller = new System.ServiceProcess.ServiceProcessInstaller
            {
                Account = System.ServiceProcess.ServiceAccount.LocalSystem,
                Username = null,
                Password = null
            };

            // Service name, description and start mode (started automatically).
            this.sInstaller = new System.ServiceProcess.ServiceInstaller
            {
                ServiceName = "SpiderServer",
                Description = "网络爬虫",
                StartType = System.ServiceProcess.ServiceStartMode.Automatic
            };

            // Same registration order as before: process installer first, then service installer.
            this.Installers.Add(this.spInstaller);
            this.Installers.Add(this.sInstaller);
        }
    }
}
引用程序集Spider。现在我们需要为Spider添加职责链处理过程:自定义新类ChainNode,该类继承AbsChain,
并重写方法Process,完成从HTML中获取所有的超链接,并将有效超链接添加到UrlStack中,同时获取当前HTML中的Title值,将其写入数据表SpiderTable中。源码如下:
namespace SpiderServer
{
    /// <summary>
    /// Chain node that scans a downloaded HTML page: every hyperlink containing
    /// "http://" is pushed onto <c>WebSpider.UrlStack</c>, and the page title (if any)
    /// is stored together with the current URL through the "AddWeb" stored procedure.
    /// </summary>
    class ChainNode : WebSpider.AbsChain
    {
        // Constructed once and shared; the patterns never change between calls.
        // The first alternative (unquoted value up to '>') wins whenever it can match,
        // so the captured text may still carry quotes and trailing attributes;
        // CleanUrl strips those afterwards.
        private static readonly Regex HrefPattern = new Regex(
            @"href=(?<web_url>[\s\S]*?)>|href=""(?<web_url>[\s\S]*?)""|href='(?<web_url>[\s\S]*?)'");

        // Case-insensitive so <TITLE> is matched without lower-casing the page,
        // which keeps the stored title in its original casing.
        private static readonly Regex TitlePattern = new Regex(
            @"<title[\s\S]*?>(?<title>[\s\S]*?)</title>", RegexOptions.IgnoreCase);

        /// <summary>
        /// Extracts links and the title from <paramref name="html"/>.
        /// Failures are traced and swallowed so that one bad page does not propagate
        /// an exception to the caller (best-effort, as in the original code).
        /// </summary>
        /// <param name="html">HTML text of the page; null or empty input is ignored.</param>
        protected override void Process(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            try
            {
                foreach (Match link in HrefPattern.Matches(html))
                {
                    string url = CleanUrl(link.Groups["web_url"].Value);
                    if (url.IndexOf("http://", System.StringComparison.Ordinal) != -1)
                    {
                        WebSpider.UrlStack.Instance.Push(url);
                    }
                }

                string title = TitlePattern.Match(html).Groups["title"].Value;
                if (!string.IsNullOrEmpty(title))
                {
                    AddUrl(this.Url, title);
                }
            }
            catch (System.Exception ex)
            {
                // Previously an empty catch: keep going, but leave a trace of what failed.
                System.Diagnostics.Trace.TraceError(
                    "ChainNode.Process failed for '" + this.Url + "': " + ex);
            }
        }

        /// <summary>
        /// Normalizes a raw href capture: if it starts with a quote, drops that quote and
        /// the first remaining single and double quote; then cuts everything from the
        /// first space onwards (trailing attributes).
        /// </summary>
        /// <param name="url">Raw text captured by the href pattern (never null).</param>
        /// <returns>The cleaned URL candidate; may be empty.</returns>
        private static string CleanUrl(string url)
        {
            if (url.Length > 0 && (url[0] == '\'' || url[0] == '"'))
            {
                url = url.Remove(0, 1);

                int quoteAt = url.IndexOf('\'');
                if (quoteAt != -1)
                {
                    url = url.Remove(quoteAt, 1);
                }

                quoteAt = url.IndexOf('"');
                if (quoteAt != -1)
                {
                    url = url.Remove(quoteAt, 1);
                }
            }

            int spaceAt = url.IndexOf(' ');
            if (spaceAt != -1)
            {
                url = url.Remove(spaceAt);
            }

            return url;
        }

        /// <summary>
        /// Calls the "AddWeb" stored procedure with the given URL and title, using the
        /// connection string stored under the "DB" key of the application settings.
        /// </summary>
        /// <param name="url">URL of the processed page.</param>
        /// <param name="title">Title extracted from the page.</param>
        private void AddUrl(string url, string title)
        {
            using (System.Data.SqlClient.SqlConnection conn = new System.Data.SqlClient.SqlConnection())
            {
                conn.ConnectionString = System.Configuration.ConfigurationManager.AppSettings["DB"];
                conn.Open();
                using (System.Data.SqlClient.SqlCommand cmd = conn.CreateCommand())
                {
                    cmd.CommandText = "AddWeb";
                    cmd.CommandType = System.Data.CommandType.StoredProcedure;
                    cmd.Parameters.AddWithValue("@url", url);
                    cmd.Parameters.AddWithValue("@title", title);
                    cmd.ExecuteNonQuery();
                }
            }
        }
    }
}
自定义类MyServer继承AbsThreadManager,重写GetChainHeader方法,告诉程序职责链的处理头节点:
namespace SpiderServer
{
    /// <summary>
    /// Thread manager for the spider service; supplies <see cref="ChainNode"/> as the
    /// head node of the processing chain.
    /// </summary>
    class MyServer : WebSpider.AbsThreadManager
    {
        /// <summary>
        /// Creates the head node of the chain of responsibility.
        /// </summary>
        /// <returns>A new <see cref="ChainNode"/> instance on every call.</returns>
        protected override WebSpider.AbsChain GetChainHeader()
        {
            WebSpider.AbsChain head = new ChainNode();
            return head;
        }
    }
}
完成启动服务与停止服务过程:
namespace SpiderServer
{
    /// <summary>
    /// Windows service hosting the spider. Pending URLs are loaded from the
    /// TempSplider table when the service starts and written back when it stops.
    /// </summary>
    public partial class SpiderServer : ServiceBase
    {
        private MyServer server = new MyServer();

        public SpiderServer()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Pushes every URL stored in TempSplider onto <c>WebSpider.UrlStack</c>
        /// and then starts the spider.
        /// </summary>
        /// <param name="args">Service start arguments (not used).</param>
        protected override void OnStart(string[] args)
        {
            using (SqlConnection conn = new SqlConnection())
            {
                conn.ConnectionString = System.Configuration.ConfigurationManager.AppSettings["DB"];
                conn.Open();
                using (SqlCommand cmd = conn.CreateCommand())
                {
                    cmd.CommandText = "select Url from TempSplider";
                    cmd.CommandType = CommandType.Text;
                    using (SqlDataReader dr = cmd.ExecuteReader())
                    {
                        while (dr.Read())
                        {
                            WebSpider.UrlStack.Instance.Push(dr[0].ToString());
                        }
                    }
                }
            }

            server.Start("");
        }

        /// <summary>
        /// Stops the spider and replaces the contents of TempSplider with the URLs
        /// still waiting on <c>WebSpider.UrlStack</c>.
        /// </summary>
        protected override void OnStop()
        {
            server.Stop();

            using (SqlConnection conn = new SqlConnection())
            {
                conn.ConnectionString = System.Configuration.ConfigurationManager.AppSettings["DB"];
                conn.Open();

                // Delete + re-insert run in one transaction: if saving fails part-way,
                // disposing the uncommitted transaction rolls back and the previously
                // saved URL list is kept instead of being left empty or partial.
                using (SqlTransaction tx = conn.BeginTransaction())
                {
                    using (SqlCommand cmd = conn.CreateCommand())
                    {
                        cmd.Transaction = tx;
                        cmd.CommandType = CommandType.Text;
                        cmd.CommandText = "delete from TempSplider";
                        cmd.ExecuteNonQuery();
                    }

                    // Snapshot the count first; Pop shrinks the stack as we go.
                    int count = WebSpider.UrlStack.Instance.Count;
                    for (int i = 0; i < count; i++)
                    {
                        string url = WebSpider.UrlStack.Instance.Pop();
                        using (SqlCommand cmd = conn.CreateCommand())
                        {
                            cmd.Transaction = tx;
                            cmd.CommandType = CommandType.Text;
                            cmd.CommandText = "insert into TempSplider(Url) values(@url)";
                            cmd.Parameters.AddWithValue("@url", url);
                            cmd.ExecuteNonQuery();
                        }
                    }

                    tx.Commit();
                }
            }
        }
    }
}
编译工程,生成SpiderServer.exe文件,用.NET自带的installutil.exe工具将服务安装好就可以了。记住,每次启动服务前,表TempSplider中都必须有URL记录,因为程序要从表中装载URL到UrlStack中,工作线程就是从UrlStack中取URL并获取相应HTML的。
完
源码下载地址:http://download.youkuaiyun.com/source/460975