用C#提取网页中的超链接

      using System;
  
using System.Xml;
  
using System.Text;   using System.Net;
  
using System.IO;
  
using System.Collections;
  
using System.Text.RegularExpressions;
  
/// <summary>
/// Console tool that downloads a web page, extracts the absolute http/https
/// links found in its HTML, and writes them to the file "HyperLinks.xml".
/// </summary>
public class App
{
    // Absolute URL: scheme, dotted host name, optional path/query part.
    // The character classes use \w and \. (the backslashes are required;
    // with forward slashes the pattern does not match ordinary URLs).
    private static readonly Regex LinkRegex = new Regex(
        @"https?://([\w-]+\.)+[\w-]+(/[\w\- ./?%&=]*)?",
        RegexOptions.IgnoreCase);

    // Captures one of the known suffixes only when it is the LAST label of the
    // host part: [^/?#]* cannot cross into the path, and the look-ahead demands
    // that the host ends right after the suffix.
    private static readonly Regex DomainRegex = new Regex(
        @"^https?://[^/?#]*?\.(com|net|cn|org|gov)(?=[:/?#]|$)",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Program entry point: prompts for a page address, downloads the page,
    /// extracts its links and writes them to the XML file.
    /// </summary>
    public static void Main()
    {
        Console.Write("请输入一个网页地址:");
        string strURL = Console.ReadLine();

        // ReadLine returns null at end of input; an empty address cannot be fetched.
        if (strURL == null || strURL.Trim().Length == 0)
        {
            Console.WriteLine("未输入网页地址。");
            return;
        }

        strURL = strURL.Trim();

        // Only prepend a scheme when none is present, so short inputs and
        // https:// addresses are handled without throwing or double-prefixing.
        if (!HasHttpScheme(strURL))
        {
            strURL = @"http://" + strURL;
        }

        Console.WriteLine("正在获取页面代码,请稍侯...");
        string strCode = GetPageSource(strURL);

        Console.WriteLine("正在提取超链接,请稍侯...");
        ArrayList alLinks = GetHyperLinks(strCode);

        Console.WriteLine("正在写入文件,请稍侯...");
        WriteToXml(strURL, alLinks);
    }

    // Returns true when the address already starts with http:// or https:// (any letter case).
    private static bool HasHttpScheme(string address)
    {
        return address.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || address.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Downloads the HTML source of the given page.
    /// </summary>
    /// <param name="URL">Absolute address of the page.</param>
    /// <returns>The response body decoded as GB2312 text.</returns>
    static string GetPageSource(string URL)
    {
        HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(new Uri(URL));

        // Request options must be set before GetResponse() sends the request.
        hwReq.Method = "GET";
        hwReq.KeepAlive = false;

        // The decoding is hard-coded to GB2312, so pages served in another
        // encoding will not decode correctly.
        // NOTE(review): on runtimes without built-in code pages this encoding
        // may need to be registered first - confirm for the target platform.
        using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
        using (StreamReader reader = new StreamReader(
            hwRes.GetResponseStream(), Encoding.GetEncoding("GB2312")))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Extracts the distinct absolute http/https URLs contained in HTML text.
    /// </summary>
    /// <param name="htmlCode">HTML source to scan; null or empty yields an empty list.</param>
    /// <returns>An <see cref="ArrayList"/> of unique URL strings, sorted with the default comparer.</returns>
    static ArrayList GetHyperLinks(string htmlCode)
    {
        ArrayList al = new ArrayList();

        if (htmlCode == null || htmlCode.Length == 0)
        {
            return al;
        }

        // Hash lookup filters duplicate URLs without rescanning the list for every match.
        Hashtable seen = new Hashtable();

        foreach (Match match in LinkRegex.Matches(htmlCode))
        {
            string link = match.Value;

            if (!seen.ContainsKey(link))
            {
                seen.Add(link, null);
                al.Add(link);
            }
        }

        al.Sort();

        return al;
    }

    /// <summary>
    /// Writes the URLs to "HyperLinks.xml", one element per URL named after
    /// the URL's domain suffix (see <see cref="GetDomain"/>).
    /// </summary>
    /// <param name="strURL">Address of the page the links were taken from; written into a comment.</param>
    /// <param name="alHyperLinks">URL strings to write.</param>
    static void WriteToXml(string strURL, ArrayList alHyperLinks)
    {
        // XML comments may not contain "--"; break up any run of hyphens
        // coming from the URL so WriteComment does not reject the text.
        string commentText = Regex.Replace("提取自" + strURL + "的超链接", "-(?=-)", "- ");

        // using guarantees the file handle is released even if writing fails.
        using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
        {
            writer.Formatting = Formatting.Indented;
            writer.WriteStartDocument(false);
            writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
            writer.WriteComment(commentText);

            // Two nested "HyperLinks" elements are kept exactly as in the original
            // output format. NOTE(review): urls.dtd is not visible here - confirm
            // that the nesting is what the DTD expects.
            writer.WriteStartElement("HyperLinks");
            writer.WriteStartElement("HyperLinks", null);
            writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

            foreach (string link in alHyperLinks)
            {
                writer.WriteElementString(GetDomain(link), null, link);
            }

            writer.WriteEndElement();
            writer.WriteEndElement();

            writer.Flush();
        }
    }

    /// <summary>
    /// Returns the domain suffix of a URL's host name.
    /// </summary>
    /// <param name="strURL">Absolute http/https URL.</param>
    /// <returns>
    /// "com", "net", "cn", "org" or "gov" (lower case) when the host ends with
    /// that suffix; otherwise "other".
    /// </returns>
    static string GetDomain(string strURL)
    {
        if (strURL == null)
        {
            return "other";
        }

        Match m = DomainRegex.Match(strURL);

        if (!m.Success)
        {
            return "other";
        }

        // Lower-case the suffix so the XML element name does not depend on
        // how the host was capitalised in the page.
        return m.Groups[1].Value.ToLowerInvariant();
    }
}

评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值