新华网内容抓取与存储-优快云博客

本文链接：https://blog.youkuaiyun.com/xieweiman/article/details/6056777

做数据采集系统的朋友可能遇到过采集一些著名网站的数据填入自己的数据库中的问题。

现在我将自己做的采集数据的方法贴出来大家一起看看

我首先使用WebClient传入一个栏目（目录）页的网址，这样更便于分类处理

转换字符串的格式

使用正则表达式分析出字符串中的所有超链接

先对这些超链接进行筛选，去除一部分无用的链接，这样可以节省资源

循环分析这些超链接

得到内容页

再次进行内容分析

像新浪、新华、搜狐等网站的内容页都有开始和结束的标志

下面我以新华网为例进行分析读取并插入到数据库

使用WebClient传入地址下载网页，得到网页的信息

// Download the raw bytes of the section index page. The URL must be a normal
// C# string literal delimited by straight double quotes. WebClient is
// IDisposable, so it is released as soon as the download completes.
byte[] bytes;
using (WebClient w = new WebClient())
{
    bytes = w.DownloadData("http://www.xinhuanet.com/politics/xw.htm");
}

把网页转换为UTF-8格式的字符串
string strHtml = System.Text.Encoding.UTF8.GetString(bytes);

使用正则表达式分析出网页中的所有的超链接

// Matches <a ... href=...>title</a> anchors.
//   "url"   - the href value, whether double-quoted, single-quoted or unquoted.
//   "title" - the link text between the opening tag and </a>; it may not contain
//             '<' or '>', so anchors wrapping nested tags are not matched.
// \s is the regex whitespace class; [^>]* keeps each part inside a single tag
// instead of running greedily to the end of the line.
string p = @"<a\b[^>]*?href\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^>\s]+))[^>]*>(?<title>[^<>]*)</a>";

System.Text.RegularExpressions.Regex reg = new System.Text.RegularExpressions.Regex(p, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
System.Text.RegularExpressions.MatchCollection ms = reg.Matches(strHtml);

使用foreach 循环读出网页的超链接网址

foreach (System.Text.RegularExpressions.Match m in ms)
{

剔除长度太短的超链接

if (m.Groups["url"].Value.Length < 10)
{

            }
            else
            {

剔除格式不一致的超链接网址
if (Convert.ToString(m.Groups["url"].Value).Substring(m.Groups["url"].Value.Length - 4, 4).Equals(".htm"))
{

根据网址使用方法得到新的页面信息

string detailfirst = GetWebContent(Convert.ToString(m.Groups["url"].Value));

分析网页内容，判断网页中是否存在新华网内容页的开始标志（注意：下面各个 IndexOf 中的标志字符串在转载时丢失，显示为空字符串，使用时需要换成页面中实际的开始/结束标志）
                    int iBodyStart = detailfirst.IndexOf("", 0);
                    if (iBodyStart > 0)
                    {
                        int iStart = detailfirst.IndexOf("", iBodyStart);
                        if (iStart > 0)
                        {
                            int iTableStart = detailfirst.IndexOf("", iStart);
                            if (iTableStart > 0)
                            {

新华网内容页的结束标志
                                int iTableEnd = detailfirst.IndexOf("", iTableStart);
                                if (iTableEnd > 0)
                                {

截取开始标志到结束标志之间的内容

                                           string strWeb = detailfirst.Substring(iTableStart, iTableEnd - iTableStart);
                                                if (strWeb.Length > 0)
                                                {
                                                    strWeb = GetTitle(strWeb); //过滤内容中的特殊标点符号

newsdetail = strWeb;
string column_id = "";

使用sql语句直接插入到数据库中
                                                    // The title and body come from a scraped page and are untrusted.
                                                    // Double every single quote so an apostrophe in the text can neither
                                                    // break the statement nor inject SQL.
                                                    // NOTE(review): a parameterized command is the proper fix, but
                                                    // Class1.AddUser only accepts a finished SQL string here — confirm
                                                    // whether its API can be extended to take parameters.
                                                    string safeTitle = title.Replace("'", "''");
                                                    string safeDetail = newsdetail.Replace("'", "''");
                                                    string sql = "insert into columns(title,detail) values('" + safeTitle + "','" + safeDetail + "')";
                                                    if (Class1.AddUser(sql))
                                                    {
                                                        // Result is currently ignored; presumably true means the row was inserted — verify against Class1.
                                                    }
                                                }
                                            }
                                        }

}
}

                            }
                        }
                    }

//根据Url地址得到网页的html源码
    /// <summary>
    /// Downloads the page at <paramref name="Url"/> and returns its body decoded as UTF-8.
    /// </summary>
    /// <param name="Url">Address of the page to fetch.</param>
    /// <returns>
    /// The response body as text, or an empty string if the request or the read
    /// fails for any reason (best-effort: no exception reaches the caller).
    /// </returns>
    private static string GetWebContent(string Url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
            // Timeout is expressed in milliseconds (30 seconds).
            request.Timeout = 30000;
            request.Headers.Set("Pragma", "no-cache");

            // Dispose the response, stream and reader deterministically so the
            // underlying connection is released after every page; this method is
            // called once per link inside the scraping loop.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream streamReceive = response.GetResponseStream())
            using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.UTF8))
            {
                return streamReader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Keep the original best-effort contract (empty string on failure),
            // but leave a trace instead of swallowing the error silently.
            System.Diagnostics.Trace.TraceWarning("GetWebContent failed for '{0}': {1}", Url, ex.Message);
            return "";
        }
    }