GetDocLinks.cs代码:
using System;
using System.Collections;
namespace HtmlAgilityPack.Samples
{
/// <summary>
/// Console sample: downloads a web page, saves a local copy, and prints the
/// link and reference URLs that <see cref="DocumentWithLinks"/> extracts from it.
/// </summary>
class GetDocLinks
{
    /// <summary>
    /// Entry point. Loads the page, writes it to "mshome.htm", prints both URL
    /// lists, then waits for a key press so the console window stays open.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    [STAThread]
    static void Main(string[] args)
    {
        string address = @"http://www.microsoft.com";
        HtmlWeb web = new HtmlWeb();
        HtmlDocument page = web.Load(address);
        page.Save("mshome.htm");

        DocumentWithLinks analyzed = new DocumentWithLinks(page);
        PrintSection("链接 urls:", analyzed.Links);
        PrintSection("引用 urls:", analyzed.References);

        Console.ReadKey();
    }

    /// <summary>
    /// Writes a heading line followed by one line per entry of the list.
    /// </summary>
    /// <param name="heading">Text printed before the entries.</param>
    /// <param name="entries">Values to print, in list order.</param>
    private static void PrintSection(string heading, ArrayList entries)
    {
        Console.WriteLine(heading);
        foreach (object entry in entries)
        {
            Console.WriteLine(entry);
        }
    }
}
/// <summary>
/// Scans an <see cref="HtmlDocument"/> once, at construction time, and exposes
/// two URL lists:
/// <see cref="Links"/> holds the values of <c>background</c>, <c>src</c> and
/// <c>lowsrc</c> attributes on any element plus <c>href</c> on <c>link</c> elements;
/// <see cref="References"/> holds the <c>href</c> values of <c>a</c> elements.
/// </summary>
public class DocumentWithLinks
{
    // Attribute names probed on every matched node, in the order their values are recorded.
    private static readonly string[] LinkAttributeNames = { "background", "href", "src", "lowsrc" };

    private ArrayList _links;
    private ArrayList _references;
    private HtmlDocument _doc;

    /// <summary>
    /// Stores the document and immediately collects its links and references.
    /// </summary>
    /// <param name="doc">The parsed HTML document to scan.</param>
    /// <exception cref="ArgumentNullException"><paramref name="doc"/> is null.</exception>
    public DocumentWithLinks(HtmlDocument doc)
    {
        if (doc == null)
        {
            throw new ArgumentNullException("doc");
        }

        _doc = doc;
        GetLinks();
        GetReferences();
    }

    /// <summary>
    /// Attribute values collected by the link scan, in document order.
    /// Empty when no element carries any of the scanned attributes.
    /// </summary>
    public ArrayList Links
    {
        get { return _links; }
    }

    /// <summary>
    /// The <c>href</c> values of all <c>a</c> elements that have one.
    /// Empty when the document contains no such element.
    /// </summary>
    public ArrayList References
    {
        get { return _references; }
    }

    /// <summary>
    /// Rebuilds <c>_links</c> from every element that has at least one of the
    /// background / lowsrc / src / href attributes.
    /// </summary>
    private void GetLinks()
    {
        _links = new ArrayList();

        HtmlNodeCollection candidates =
            _doc.DocumentNode.SelectNodes("//*[@background or @lowsrc or @src or @href]");

        // The code treats a null result as "nothing matched" and leaves the list empty.
        if (candidates != null)
        {
            foreach (HtmlNode candidate in candidates)
            {
                foreach (string attributeName in LinkAttributeNames)
                {
                    ParseLink(candidate, attributeName);
                }
            }
        }
    }

    /// <summary>
    /// Rebuilds <c>_references</c> from the href of every anchor element.
    /// </summary>
    private void GetReferences()
    {
        _references = new ArrayList();

        HtmlNodeCollection anchors = _doc.DocumentNode.SelectNodes("//a[@href]");
        if (anchors != null)
        {
            foreach (HtmlNode anchor in anchors)
            {
                // The XPath only matches anchors that have an href attribute.
                HtmlAttribute target = anchor.Attributes["href"];
                _references.Add(target.Value);
            }
        }
    }

    /// <summary>
    /// Appends the value of the named attribute to <c>_links</c> when the node has it.
    /// An <c>href</c> is only accepted on <c>link</c> elements; on any other element
    /// it is skipped.
    /// </summary>
    /// <param name="node">Element to inspect.</param>
    /// <param name="name">Attribute name to look up on the element.</param>
    private void ParseLink(HtmlNode node, string name)
    {
        HtmlAttribute attribute = node.Attributes[name];

        if (attribute != null && (name != "href" || node.Name == "link"))
        {
            _links.Add(attribute.Value);
        }
    }
}
}
运行结果如图:

