Html Agility Pack (HAP) is an HTML parser library that builds a read/write DOM and supports plain XPath or XSLT queries, and it is free. It exposes HTML through the standard XML-style API (the XPathDocument-related classes) in .NET.
It is a .NET code library that allows you to parse real-world (“out of the web”) HTML files. The object model is very similar to the one System.Xml proposes, but for HTML documents (or streams).
There’s no dependency on anything else than .Net’s XPath implementation and no dependency on Internet Explorer’s MSHTML dll or W3C’s HTML tidy or Active X / COM object, or anything like that. There’s also no adherence to XHTML or XML, although you can actually produce XML using the tool.
How to Use:
Ø Install the latest Html Agility Pack library (originally hosted at http://htmlagilitypack.codeplex.com/ ; now distributed via NuGet as “HtmlAgilityPack”);
Ø Add “HAP” as reference in the application;
Ø Add “using HtmlAgilityPack” in code;
Ø Get HtmlDocument instance by calling the method in HAP class library;
Ø Get the HtmlNode and read the node/attribute value;
The following classes are frequently used: HtmlDocument, HtmlNodeCollection, HtmlNode and HtmlWeb
Usually, to get an HtmlDocument, you first obtain the web page source (how to get the page source is described in another document) and then call HtmlDocument.Load() or HtmlDocument.LoadHtml(); alternatively, you can call HtmlWeb.Get() or HtmlWeb.Load() to load the HTML directly from a URL. After getting the HtmlDocument instance, we can use XPath to get an HtmlNode.
About XPath, please refer to http://www.w3.org/TR/xpath/
BTW, if the HtmlNode has an “id” attribute, like “<div id='post_list'>value</div>”, calling GetElementbyId() works for getting the HtmlNode; then get the value via HtmlNode.InnerText or HtmlNode.Attributes.
Please see the following C# code snippet.
Code snippet:
//get HtmlAgilityPack.HtmlDocument object
HtmlDocument doc = new HtmlDocument();
//load HTML (pageSource is the page's HTML as a string, obtained separately)
doc.LoadHtml(pageSource);
//get HtmlNode by ID (note: the real API name uses a lowercase 'b' -- GetElementbyId)
HtmlNode navNode = doc.GetElementbyId("post_list");
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
namespace RegexPractice
{
    class Program
    {
        /// <summary>
        /// Downloads the Baidu "top keyword" page and extracts the hot search
        /// keywords with Html Agility Pack (the regex-based approach this
        /// replaced has been removed as dead code).
        /// </summary>
        static void Main(string[] args)
        {
            string pageUrl = "http://top.baidu.com/buzz.php?p=top_keyword";

            // WebClient is IDisposable -- dispose it deterministically.
            byte[] pageSourceBytes;
            using (WebClient wc = new WebClient())
            {
                pageSourceBytes = wc.DownloadData(new Uri(pageUrl));
            }

            // The page is served in the GB2312 (simplified Chinese) code page.
            string pageSource = Encoding.GetEncoding("gb2312").GetString(pageSourceBytes);

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(pageSource);

            // Select every <a target="_blank"> inside a <td class="key">.
            // (The original XPath had a stray space: a[@ target='_blank'].)
            // NOTE: SelectNodes returns null -- not an empty collection --
            // when nothing matches, so guard before enumerating.
            HtmlNodeCollection keyNodes = doc.DocumentNode.SelectNodes("//td[@class='key']/a[@target='_blank']");
            List<string> keyWords = new List<string>();
            if (keyNodes != null)
            {
                foreach (HtmlNode keyNode in keyNodes)
                {
                    keyWords.Add(keyNode.InnerText);
                }
            }
        }
    }
}
Another code snippet
Download a specified number of pictures from “http://browse.deviantart.com/customization/wallpaper/widescreen/?order=15” and save them to local files.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
using System.IO;
namespace RegexPractice
{
public class Util
{
    /// <summary>
    /// Downloads the raw page source of <paramref name="url"/> as bytes.
    /// </summary>
    public static byte[] GetPageSourceBytes(string url)
    {
        // WebClient is IDisposable; dispose it deterministically.
        using (WebClient wc = new WebClient())
        {
            return wc.DownloadData(new Uri(url));
        }
    }

    /// <summary>
    /// Downloads the page source of <paramref name="url"/> and decodes it
    /// with the named encoding (e.g. "gb2312", "utf-8").
    /// </summary>
    public static string GetPageSource(string url, string encodingType)
    {
        byte[] pageSourceBytes = GetPageSourceBytes(url);
        return Encoding.GetEncoding(encodingType).GetString(pageSourceBytes);
    }

    /// <summary>
    /// Downloads the image at <paramref name="url"/> into
    /// <paramref name="dirPath"/>, creating the directory if needed.
    /// A GUID is appended to <paramref name="fileName"/> so images with
    /// identical titles do not overwrite each other.
    /// (Method name keeps the original "Sava" typo so existing callers
    /// keep compiling.)
    /// </summary>
    public static void SavaImagesToFile(string url, string dirPath, string fileName)
    {
        if (!Directory.Exists(dirPath))
        {
            Directory.CreateDirectory(dirPath);
        }
        using (WebClient wc = new WebClient())
        {
            // NOTE(review): the saved file carries no extension; consider
            // appending Path.GetExtension(url) if viewers need one.
            wc.DownloadFile(url, Path.Combine(dirPath, fileName + Guid.NewGuid().ToString()));
        }
    }
}
public class ImageInfo
{
public string Title;
public string SrcPath;
/// <summary>
/// Scrapes one listing page and returns an ImageInfo (title + thumbnail
/// source) for each entry found, at most 24 (one page's worth).
/// Returns an empty list when the page contains no matching nodes.
/// </summary>
public static List<ImageInfo> GetImageInfoList(string url)
{
    // URL:http://browse.deviantart.com/customization/wallpaper/widescreen/?order=15
    // NOTE(review): deviantart serves UTF-8; "gb2312" looks wrong here -- confirm.
    string pageSource = Util.GetPageSource(url, "gb2312");
    HtmlDocument doc = new HtmlDocument();
    doc.LoadHtml(pageSource);
    // Each thumbnail lives in <span class="tt-w"> holding an <img>
    // (thumbnail source) and an <a> (title link).
    HtmlNodeCollection spanNodeList = doc.DocumentNode.SelectNodes("//span[@class='tt-w']");
    List<ImageInfo> imageList = new List<ImageInfo>();
    if (spanNodeList == null)
    {
        // SelectNodes returns null (not empty) when nothing matches.
        return imageList;
    }
    // Don't index past the actual match count if fewer than 24 appear.
    int count = Math.Min(24, spanNodeList.Count);
    for (int i = 0; i < count; i++)
    {
        HtmlNode curSpanNode = spanNodeList[i];
        // BUG FIX: "//img" searches from the DOCUMENT root and always
        // returned the page's first <img>; ".//img" searches relative
        // to the current span.
        HtmlNode curImageNode = curSpanNode.SelectSingleNode(".//img");
        HtmlNode curLinkNode = curSpanNode.SelectSingleNode("a");
        ImageInfo image = new ImageInfo();
        image.Title = curLinkNode.InnerText;
        image.SrcPath = curImageNode.Attributes["src"].Value;
        imageList.Add(image);
    }
    return imageList;
}
class Program
{
/// <summary>
/// Downloads the first 100 wallpaper thumbnails from the listing and
/// saves each to c:\Images under a sanitized file name.
/// </summary>
static void Main(string[] args)
{
    int sumCount = 100;
    string baseUrl = "http://browse.deviantart.com/customization/wallpaper/widescreen/?order=15";
    // Assign directly -- the original initialized the list and then
    // immediately overwrote it; the redundant trailing return is gone too.
    List<ImageInfo> imageInfoList = GetSumImageInfoList(sumCount, baseUrl);
    foreach (ImageInfo imageInfo in imageInfoList)
    {
        // Titles can contain characters that are illegal in file names.
        Util.SavaImagesToFile(imageInfo.SrcPath, @"c:\Images", GetValidFilename(imageInfo.Title));
    }
}
/// <summary>
/// Replaces every character that is illegal in a file name with '_'.
/// </summary>
static string GetValidFilename(string filename)
{
    char[] invalid = Path.GetInvalidFileNameChars();
    StringBuilder sanitized = new StringBuilder(filename.Length);
    foreach (char c in filename)
    {
        // Append the character as-is unless the OS forbids it in names.
        sanitized.Append(Array.IndexOf(invalid, c) >= 0 ? '_' : c);
    }
    return sanitized.ToString();
}
/// <summary>
/// Fetches ImageInfo entries page by page (24 per listing page) until
/// <paramref name="sum"/> entries have been collected; later entries
/// past the cap are skipped.
/// </summary>
static List<ImageInfo> GetSumImageInfoList(int sum, string baseUri)
{
    const int pageSize = 24;
    List<ImageInfo> collected = new List<ImageInfo>();
    // Number of pages needed: ceiling(sum / 24) via integer arithmetic.
    int pageCount = (sum - 1) / pageSize + 1;
    for (int page = 0; page < pageCount; page++)
    {
        string url = string.Format("{0}&offset={1}", baseUri, page * pageSize);
        foreach (ImageInfo info in ImageInfo.GetImageInfoList(url))
        {
            if (collected.Count < sum)
            {
                collected.Add(info);
            }
        }
    }
    return collected;
}
}
}