应用部分:
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Data;
using System.Text.RegularExpressions;
using System.Net;
using System.Threading;

namespace GetBrand ...{
/// <summary>
/// Console scraper: downloads every page of the brand index on brand.chinasspp.com,
/// extracts brand/company pairs from the bold text of each page, and stores them
/// through <c>DBHelp</c>. Pages that could not be downloaded are logged by
/// <c>DBHelp.AddLog</c> and retried at the end of the run.
/// </summary>
class Program
{
    // First and last page numbers of the brand index to crawl.
    private const int FirstPage = 1;
    private const int LastPage = 813;

    // Format string of an index page URL; {0} is the page number.
    private const string PageUrlFormat = "http://brand.chinasspp.com/Index-0-{0}.htm";

    // Upper bound on retry passes. The retry query only returns rows with DoTimes <= 3
    // and every failed download increments DoTimes, so a URL can fail at most four more
    // times; the cap also guarantees termination if a page keeps returning empty content.
    private const int MaxRetryPasses = 5;

    // Captures the text between each <b>...</b> pair (case-sensitive, may span lines).
    private static readonly Regex BoldTextPattern = new Regex("<b>(.*?)</b>", RegexOptions.Singleline);

    // Matches strings that consist only of an optionally negative integer.
    private static readonly Regex IntegerPattern = new Regex(@"^-?\d+$");

    // Shared data-access helper.
    private static readonly DBHelp Db = new DBHelp();

    /// <summary>
    /// Entry point: runs the crawl, then waits for console input before exiting.
    /// </summary>
    static void Main(string[] args)
    {
        GetUrlAndDoWork();
        Console.WriteLine("数据获取完毕,按任意键退出。。。");
        Console.Read();
    }

    /// <summary>
    /// Retries the URLs whose first download failed.
    /// </summary>
    /// <remarks>
    /// Runs as a bounded loop instead of recursing. A URL that was downloaded during this
    /// run is remembered and skipped on later passes, because a successful download does
    /// not change its Log row and it would otherwise be returned by the query forever.
    /// </remarks>
    private static void DoError()
    {
        HashSet<string> fetchedUrls = new HashSet<string>();

        for (int pass = 0; pass < MaxRetryPasses; pass++)
        {
            DataSet ds = Db.GetErrorDate();
            if (ds == null || ds.Tables.Count == 0)
            {
                return;
            }

            DataRowCollection rows = ds.Tables[0].Rows;
            bool attemptedAny = false;

            for (int i = 0; i < rows.Count; i++)
            {
                string url = rows[i]["url"].ToString();
                if (fetchedUrls.Contains(url))
                {
                    continue;
                }

                attemptedAny = true;
                if (TrySaveDate(url, i))
                {
                    fetchedUrls.Add(url);
                }
            }

            // Nothing left that has not already been downloaded: the retry is finished.
            if (!attemptedAny)
            {
                return;
            }
        }
    }

    /// <summary>
    /// Builds the URL of every index page, processes each one, then retries the failures.
    /// </summary>
    private static void GetUrlAndDoWork()
    {
        for (int page = FirstPage; page <= LastPage; page++)
        {
            SaveDate(string.Format(PageUrlFormat, page), page);
        }

        DoError();
    }

    /// <summary>
    /// Downloads one page and saves the brand/company pairs found on it.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="i">Number shown in the console progress message.</param>
    private static void SaveDate(string url, int i)
    {
        TrySaveDate(url, i);
    }

    /// <summary>
    /// Worker behind <see cref="SaveDate"/>.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="i">Number shown in the console progress message.</param>
    /// <returns>
    /// <c>true</c> when the page content was downloaded (even if it held no listing);
    /// <c>false</c> when the download produced no content.
    /// </returns>
    private static bool TrySaveDate(string url, int i)
    {
        string content = ReturnByUrl(url);
        if (string.IsNullOrEmpty(content))
        {
            return false;
        }

        // The listing sits between the closing </SELECT> tag and the last "转到" text.
        int start = content.IndexOf("</SELECT>", StringComparison.Ordinal);
        int end = content.LastIndexOf("转到", StringComparison.Ordinal);
        if (start < 0 || end <= start)
        {
            // Either marker is missing or they are out of order: nothing to extract.
            return true;
        }

        string workContent = content.Substring(start, end - start);
        MatchCollection boldRuns = BoldTextPattern.Matches(workContent);

        Console.WriteLine("================开始获取第" + i.ToString() + "页数据==================");

        // Bold runs come in pairs: brand name first, company second.
        // A trailing unpaired run is ignored instead of indexing out of range.
        for (int k = 0; k + 1 < boldRuns.Count; k += 2)
        {
            string brandName = boldRuns[k].Groups[1].Value;
            string company = boldRuns[k + 1].Groups[1].Value;

            if (!IsValidate(brandName))
            {
                continue;
            }

            Console.WriteLine(brandName + "----" + company);

            Brand model = new Brand();
            model.BrandName = brandName;
            model.ComPan = company;
            model.Pic = "";
            model.Create = DateTime.Now;
            Db.Add(model);
        }

        Console.WriteLine("================本页获取数据结束=====================");
        return true;
    }

    /// <summary>
    /// Checks whether a captured string is usable as a brand name.
    /// </summary>
    /// <param name="_str">Text to check.</param>
    /// <returns><c>false</c> when the text is only an integer; otherwise <c>true</c>.</returns>
    private static bool IsValidate(string _str)
    {
        return !IntegerPattern.IsMatch(_str);
    }

    /// <summary>
    /// Downloads the content of a URL as text.
    /// </summary>
    /// <param name="url">Address to request.</param>
    /// <returns>
    /// The response body, or an empty string when the request failed. A failed URL is
    /// recorded through <c>DBHelp.AddLog</c> so that it can be retried later.
    /// </returns>
    private static string ReturnByUrl(string url)
    {
        string responseFromServer = string.Empty;
        try
        {
            WebRequest request = WebRequest.Create(url);
            request.Credentials = CredentialCache.DefaultCredentials;

            // using blocks release the response, stream and reader even when reading fails.
            // NOTE(review): Encoding.Default depends on the machine/runtime - confirm it
            // matches the encoding the site actually serves.
            using (WebResponse response = request.GetResponse())
            using (Stream dataStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(dataStream, Encoding.Default))
            {
                responseFromServer = reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Deliberately broad: any failure puts the URL on the retry log.
            Console.Error.WriteLine(url + " : " + ex.Message);
            Db.AddLog(url);
        }

        return responseFromServer;
    }
}
}
数据交互部分:
using System;
using System.Collections.Generic;
using System.Text;
using System.Data;
using System.Data.SqlClient;
namespace GetBrand ...{
/// <summary>
/// Data-access helper for the scraper. Writes brand rows to the Brand table and keeps
/// the list of failed page URLs in the Log table. Every statement is executed through
/// Ucar.BaseClass.DbHelperSQL with parameters, never by concatenating values into SQL.
/// </summary>
class DBHelp
{
    // NOTE(review): the server, login and password are hard-coded in source;
    // move them to configuration before using this outside a local test machine.
    private readonly string SqlConnection = "server=.;database=pubs;uid=sa;pwd=123123;";

    /// <summary>
    /// Inserts a brand row stamped with the current time.
    /// </summary>
    /// <param name="BrandName">Brand name; null is stored as an empty string.</param>
    /// <param name="Conmpany">Company name; null is stored as an empty string.</param>
    public void Add(string BrandName, string Conmpany)
    {
        string sql = "insert into Brand (BrandName,ComPan,[Create]) values (@BrandName,@ComPan,@Create)";
        SqlParameter[] parameters = {
            new SqlParameter("@BrandName", SqlDbType.VarChar, 50),
            new SqlParameter("@ComPan", SqlDbType.VarChar, 100),
            new SqlParameter("@Create", SqlDbType.DateTime)
        };
        parameters[0].Value = BrandName ?? string.Empty;
        parameters[1].Value = Conmpany ?? string.Empty;
        // Passed as a typed DateTime so the value does not depend on the current culture.
        parameters[2].Value = DateTime.Now;
        Ucar.BaseClass.DbHelperSQL.ExecuteSql(sql, SqlConnection, parameters);
    }

    /// <summary>
    /// Inserts a brand row from a populated model.
    /// </summary>
    /// <param name="model">Values to insert; null string properties are stored as NULL.</param>
    /// <exception cref="ArgumentNullException"><paramref name="model"/> is null.</exception>
    public void Add(Brand model)
    {
        if (model == null)
        {
            throw new ArgumentNullException("model");
        }

        string sql = "insert into Brand(BrandName,ComPan,Pic,[Create]) values (@BrandName,@ComPan,@Pic,@Create)";
        SqlParameter[] parameters = {
            new SqlParameter("@BrandName", SqlDbType.VarChar, 50),
            new SqlParameter("@ComPan", SqlDbType.VarChar, 100),
            new SqlParameter("@Pic", SqlDbType.VarChar, 100),
            new SqlParameter("@Create", SqlDbType.DateTime)
        };
        parameters[0].Value = ToDbValue(model.BrandName);
        parameters[1].Value = ToDbValue(model.ComPan);
        parameters[2].Value = ToDbValue(model.Pic);
        parameters[3].Value = model.Create;
        Ucar.BaseClass.DbHelperSQL.ExecuteSql(sql, SqlConnection, parameters);
    }

    /// <summary>
    /// Reads the Log rows that are still eligible for a retry
    /// (IsValidata = 0 and DoTimes &lt;= 3).
    /// </summary>
    /// <returns>The result of the query as returned by DbHelperSQL.Query.</returns>
    public DataSet GetErrorDate()
    {
        string sql = "select * from Log where IsValidata=0 and DoTimes<=3";
        return Ucar.BaseClass.DbHelperSQL.Query(sql, SqlConnection);
    }

    /// <summary>
    /// Checks whether a URL already has a row in the Log table.
    /// </summary>
    /// <param name="url">URL to look up.</param>
    /// <returns>The result of DbHelperSQL.Exists for the count query.</returns>
    public bool Exists(string url)
    {
        string sql = "select count(1) from Log where url= @url";
        SqlParameter[] parameters = { CreateUrlParameter(url) };
        return Ucar.BaseClass.DbHelperSQL.Exists(sql, SqlConnection, parameters);
    }

    /// <summary>
    /// Records a failed URL: increments DoTimes when the URL is already logged,
    /// otherwise inserts a new row with IsValidata '0' and DoTimes 0.
    /// </summary>
    /// <param name="url">URL whose download failed.</param>
    public void AddLog(string url)
    {
        string sql;
        if (Exists(url))
        {
            sql = "update Log set DoTimes=DoTimes+1 where url = @url";
        }
        else
        {
            sql = "insert into Log (url,IsValidata,DoTimes) values (@url,'0',0)";
        }

        // A fresh parameter object is created for each command.
        SqlParameter[] parameters = { CreateUrlParameter(url) };
        Ucar.BaseClass.DbHelperSQL.ExecuteSql(sql, SqlConnection, parameters);
    }

    // Builds the @url parameter. No explicit size is given, so the size is inferred
    // from the value and long URLs are not cut off at a fixed length.
    private static SqlParameter CreateUrlParameter(string url)
    {
        SqlParameter parameter = new SqlParameter("@url", SqlDbType.VarChar);
        parameter.Value = url;
        return parameter;
    }

    // Maps a null reference to DBNull so the parameter is still sent to the server.
    private static object ToDbValue(string value)
    {
        if (value == null)
        {
            return DBNull.Value;
        }
        return value;
    }
}
/// <summary>
/// Plain data holder for one row of the Brand table.
/// </summary>
public class Brand
{
    #region Model

    private int id;
    private string brandName;
    private string company;
    private string picture;
    private DateTime createdAt;

    /// <summary>
    /// Creates an instance with every property at its default value.
    /// </summary>
    public Brand()
    {
    }

    /// <summary>
    /// Row identifier.
    /// </summary>
    public int ID
    {
        get { return this.id; }
        set { this.id = value; }
    }

    /// <summary>
    /// Brand name.
    /// </summary>
    public string BrandName
    {
        get { return this.brandName; }
        set { this.brandName = value; }
    }

    /// <summary>
    /// Value stored in the ComPan column.
    /// </summary>
    public string ComPan
    {
        get { return this.company; }
        set { this.company = value; }
    }

    /// <summary>
    /// Value stored in the Pic column.
    /// </summary>
    public string Pic
    {
        get { return this.picture; }
        set { this.picture = value; }
    }

    /// <summary>
    /// Value stored in the Create column.
    /// </summary>
    public DateTime Create
    {
        get { return this.createdAt; }
        set { this.createdAt = value; }
    }

    #endregion Model
}
}
SQL脚本:
-- Drop the Brand table when a user table of that name already exists,
-- so that the script can be run repeatedly.
IF EXISTS (
    SELECT *
    FROM dbo.sysobjects
    WHERE id = OBJECT_ID(N'[dbo].[Brand]')
      AND OBJECTPROPERTY(id, N'IsUserTable') = 1
)
    DROP TABLE [dbo].[Brand]
GO

-- Table that receives the scraped brand rows.
CREATE TABLE [dbo].[Brand] (
    [ID]        INT IDENTITY (1, 1) NOT NULL,                 -- auto-incrementing row id
    [BrandName] VARCHAR (50)  COLLATE Chinese_PRC_CI_AS NULL, -- brand name
    [ComPan]    VARCHAR (100) COLLATE Chinese_PRC_CI_AS NULL, -- company name
    [Pic]       VARCHAR (100) COLLATE Chinese_PRC_CI_AS NULL, -- picture value (the scraper writes an empty string)
    [Create]    DATETIME NULL                                 -- time the row was created
) ON [PRIMARY]
GO
运行截图:

总结:本程序只能针对某个特定网站进行简单的数据抓取,通用的数据抓取方案仍在研究中。由于各个网站的页面结构可能不一致,本程序的局限性很大,扩展性也很差。
本文介绍了一个简单的网站数据抓取程序实现,通过C#语言利用正则表达式从指定网页中提取品牌名称和公司信息,并将数据存储到SQL Server数据库中。该程序包括数据获取、错误处理及数据库操作等功能。
1815

被折叠的 条评论
为什么被折叠?



