由于经常要写采集程序,下面三个函数是采集中很常用的函数,姑且称之为“采集系统万能正则表达式”。
全部源码见 http://www.softbk.com/news.asp?id=3564 ,欢迎一起交流。
// 获取页面的 HTML 源码
/// <summary>
/// Issues an HTTP request to <paramref name="Url"/> and returns the response body decoded as text.
/// </summary>
/// <param name="Url">Address of the page to fetch.</param>
/// <param name="charset">
/// Name of the encoding used to decode the response body. When null or empty, "gb2312" is used.
/// </param>
/// <returns>
/// The decoded response body, or an empty string when the request, the encoding lookup,
/// or the read fails for any reason.
/// </returns>
public string GetHtmlSource(string Url, string charset)
{
    if (string.IsNullOrEmpty(charset))
    {
        charset = "gb2312";
    }

    string html = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

        // The using blocks guarantee the response, stream and reader are released
        // even when reading or decoding throws part-way through.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding(charset)))
        {
            html = reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: callers receive whatever was read so far
        // (normally an empty string) instead of an exception.
    }

    return html;
}
/// <summary>
/// Extracts the text found between two delimiters in <paramref name="code"/>.
/// </summary>
/// <param name="code">Text to search (for example a downloaded HTML page).</param>
/// <param name="wordsBegin">
/// Regular-expression fragment that must precede the captured text. It is inserted into the
/// pattern unescaped, so regex metacharacters in it are interpreted as such.
/// </param>
/// <param name="wordsEnd">
/// Regular-expression fragment that must follow the captured text; also inserted unescaped.
/// </param>
/// <returns>
/// The captured text of the LAST match in <paramref name="code"/>, or an empty string when
/// there is no match or <paramref name="code"/> is null.
/// </returns>
public string SniffwebCode(string code, string wordsBegin, string wordsEnd)
{
    // [\s\S]+? captures lazily and also spans line breaks.
    // RegexOptions.Compiled is intentionally not used: the pattern is rebuilt on every call,
    // so compiling it would cost more than it saves.
    Regex regex = new Regex(wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd, RegexOptions.IgnoreCase);

    if (code == null)
    {
        return "";
    }

    string lastCapture = "";
    for (Match match = regex.Match(code); match.Success; match = match.NextMatch())
    {
        // Each later match overwrites the previous one, so the final match wins.
        lastCapture = match.Groups["title"].Value;
    }

    return lastCapture;
}
/// <summary>
/// Collects every piece of text found between two delimiters in <paramref name="code"/>.
/// </summary>
/// <param name="code">Text to search (for example a downloaded HTML page).</param>
/// <param name="wordsBegin">
/// Regular-expression fragment that must precede each captured text. It is inserted into the
/// pattern unescaped, so regex metacharacters in it are interpreted as such.
/// </param>
/// <param name="wordsEnd">
/// Regular-expression fragment that must follow each captured text; also inserted unescaped.
/// </param>
/// <returns>
/// An <see cref="ArrayList"/> of strings, one per match in order of appearance; empty when
/// there is no match or <paramref name="code"/> is null.
/// </returns>
public ArrayList SniffwebCodeReturnList(string code, string wordsBegin, string wordsEnd)
{
    // [\s\S]+? captures lazily and also spans line breaks.
    // RegexOptions.Compiled is intentionally not used: the pattern is rebuilt on every call,
    // so compiling it would cost more than it saves.
    Regex regex = new Regex(wordsBegin + @"(?<title>[\s\S]+?)" + wordsEnd, RegexOptions.IgnoreCase);

    ArrayList captures = new ArrayList();
    if (code == null)
    {
        return captures;
    }

    for (Match match = regex.Match(code); match.Success; match = match.NextMatch())
    {
        captures.Add(match.Groups["title"].Value);
    }

    return captures;
}
全部源码见 http://www.softbk.com/news.asp?id=3564 ,欢迎一起交流。
























































