// First, declare a class that holds the marker strings used to recognise
// the individual parts of a forum thread page.
using System;
using System.Collections.Generic;
using System.Text;

namespace KofskyTianyaPageViewer
{
    /**
     * Marker strings that identify the parts of a Tianya forum thread page.
     *
     * Every property is a plain string holder; the class performs no parsing.
     * NOTE: some property names are spelled "Identifier" and others
     * "Identifer". Both spellings are part of the public interface and are
     * kept as they are so that existing callers keep compiling.
     */
    class TianyaPageIdentifiers
    {
        private String _headStartIdentifier;

        /**
         * Marker at which the page head section starts.
         */
        public String headStartIdentifier
        {
            get { return _headStartIdentifier; }
            set { _headStartIdentifier = value; }
        }

        private String _headEndIdentifier;

        /**
         * Marker at which the page head section ends.
         */
        public String headEndIdentifier
        {
            get { return _headEndIdentifier; }
            set { _headEndIdentifier = value; }
        }

        private String _titleStartIdentifier;

        /**
         * Marker at which the title starts.
         */
        public String titleStartIdentifier
        {
            get { return _titleStartIdentifier; }
            set { _titleStartIdentifier = value; }
        }

        private String _titleEndIdentifier;

        /**
         * Marker at which the title ends.
         */
        public String titleEndIdentifier
        {
            get { return _titleEndIdentifier; }
            set { _titleEndIdentifier = value; }
        }

        private String _topicContentStartIdentifier;

        /**
         * Marker at which the content of the opening post starts.
         */
        public String topicContentStartIdentifier
        {
            get { return _topicContentStartIdentifier; }
            set { _topicContentStartIdentifier = value; }
        }

        private String _topicContentEndIdentifier;

        /**
         * Marker at which the content of the opening post ends.
         */
        public String topicContentEndIdentifier
        {
            get { return _topicContentEndIdentifier; }
            set { _topicContentEndIdentifier = value; }
        }

        private String _topicAuthorStartIdentifier;

        /**
         * Marker at which the name of the thread starter begins.
         */
        public String topicAuthorStartIdentifier
        {
            get { return _topicAuthorStartIdentifier; }
            set { _topicAuthorStartIdentifier = value; }
        }

        private String _topicAuthorEndIdentifier;

        /**
         * Marker at which the name of the thread starter ends.
         */
        public String topicAuthorEndIdentifier
        {
            get { return _topicAuthorEndIdentifier; }
            set { _topicAuthorEndIdentifier = value; }
        }

        private String _topicSubmitTimeStartIdentifier;

        /**
         * Marker at which the submit time of the opening post starts.
         */
        public String topicSubmitTimeStartIdentifier
        {
            get { return _topicSubmitTimeStartIdentifier; }
            set { _topicSubmitTimeStartIdentifier = value; }
        }

        private String _topicSubmitTimeEndIdentifier;

        /**
         * Marker at which the submit time of the opening post ends.
         */
        public String topicSubmitTimeEndIdentifier
        {
            get { return _topicSubmitTimeEndIdentifier; }
            set { _topicSubmitTimeEndIdentifier = value; }
        }

        private String _remarkStartIdentifer;

        /**
         * Marker at which a reply starts.
         */
        public String remarkStartIdentifer
        {
            get { return _remarkStartIdentifer; }
            set { _remarkStartIdentifer = value; }
        }

        private String _remarkEndIdentifer;

        /**
         * Marker at which a reply ends.
         */
        public String remarkEndIdentifer
        {
            get { return _remarkEndIdentifer; }
            set { _remarkEndIdentifer = value; }
        }

        private String _remarkTimeStartIdentifer;

        /**
         * Marker at which the time of a reply starts.
         */
        public String remarkTimeStartIdentifer
        {
            get { return _remarkTimeStartIdentifer; }
            set { _remarkTimeStartIdentifer = value; }
        }

        private String _remarkTimeEndIdentifer;

        /**
         * Marker at which the time of a reply ends.
         */
        public String remarkTimeEndIdentifer
        {
            get { return _remarkTimeEndIdentifer; }
            set { _remarkTimeEndIdentifer = value; }
        }

        private String _remarkAuthorStartIdentifer;

        /**
         * Marker at which the author of a reply starts.
         */
        public String remarkAuthorStartIdentifer
        {
            get { return _remarkAuthorStartIdentifer; }
            set { _remarkAuthorStartIdentifer = value; }
        }

        private String _remarkAuthorEndIdentifer;

        /**
         * Marker at which the author of a reply ends.
         */
        public String remarkAuthorEndIdentifer
        {
            get { return _remarkAuthorEndIdentifer; }
            set { _remarkAuthorEndIdentifer = value; }
        }

        private String _remarkContentStartIdentifer;

        /**
         * Marker at which the content of a reply starts.
         * (This class declares no matching end marker for reply content.)
         */
        public String remarkContentStartIdentifer
        {
            get { return _remarkContentStartIdentifer; }
            set { _remarkContentStartIdentifer = value; }
        }

        private String _pageLinkInfoStartIdentifer;

        /**
         * Marker at which the pagination link area starts.
         * If the page does not contain this string, the thread is treated
         * as having no pagination.
         * Current marker: "<table><tr><td>分页链接:"
         */
        public String pageLinkInfoStartIdentifer
        {
            get { return _pageLinkInfoStartIdentifer; }
            set { _pageLinkInfoStartIdentifer = value; }
        }

        private String _pageLinkInfoEndIdentifer;

        /**
         * Marker at which the pagination link area ends.
         */
        public String pageLinkInfoEndIdentifer
        {
            get { return _pageLinkInfoEndIdentifer; }
            set { _pageLinkInfoEndIdentifer = value; }
        }

        private String _pageLinkStartIdentifer;

        /**
         * Within the pagination area: marker at which the link of a single
         * page starts, e.g. href=
         */
        public String pageLinkStartIdentifer
        {
            get { return _pageLinkStartIdentifer; }
            set { _pageLinkStartIdentifer = value; }
        }

        private String _pageLinkEndIdentifer;

        /**
         * Within the pagination area: marker at which the link of a single
         * page ends.
         */
        public String pageLinkEndIdentifer
        {
            get { return _pageLinkEndIdentifer; }
            set { _pageLinkEndIdentifer = value; }
        }

        private String _pageIndexStartIdentifer;

        /**
         * Within the pagination area: marker at which a page number starts,
         * e.g. the "[" of "[1]".
         */
        public String pageIndexStartIdentifer
        {
            get { return _pageIndexStartIdentifer; }
            set { _pageIndexStartIdentifer = value; }
        }

        private String _pageIndexEndIdentifer;

        /**
         * Within the pagination area: marker at which a page number ends,
         * e.g. the "]" of "[1]".
         */
        public String pageIndexEndIdentifer
        {
            get { return _pageIndexEndIdentifer; }
            set { _pageIndexEndIdentifer = value; }
        }

        private String _firstPageIdentifer;

        /**
         * Within the pagination area: marker of the "first page" link.
         * If it is present, jump straight to the first page.
         */
        public String firstPageIdentifer
        {
            get { return _firstPageIdentifer; }
            set { _firstPageIdentifer = value; }
        }

        private String _currentPageStartIdentifer;

        /**
         * Within the pagination area: marker at which the current page
         * entry starts.
         */
        public String currentPageStartIdentifer
        {
            get { return _currentPageStartIdentifer; }
            set { _currentPageStartIdentifer = value; }
        }

        private String _currentPageEndIdentifer;

        /**
         * Within the pagination area: marker at which the current page
         * entry ends.
         */
        public String currentPageEndIdentifer
        {
            get { return _currentPageEndIdentifer; }
            set { _currentPageEndIdentifer = value; }
        }
    }
}

// The marker strings below were found by manually analysing Tianya thread
// pages.
// NOTE(review): the two methods that follow are private static excerpts;
// their enclosing class is not shown here.

/**
 * Fills in the marker strings for Tianya forum thread pages.
 *
 * TODO: read these values from an XML configuration file later on.
 *
 * identifers: the object whose properties are assigned; must not be null.
 * Throws ArgumentNullException when identifers is null.
 */
private static void initTianyaIdentifiersInfo(ref TianyaPageIdentifiers identifers)
{
    if (identifers == null)
    {
        throw new ArgumentNullException("identifers");
    }

    // Title and head section.
    identifers.titleStartIdentifier = "<TITLE>";
    identifers.titleEndIdentifier = "</TITLE>";
    identifers.headStartIdentifier = "<HEAD>";
    identifers.headEndIdentifier = "</HEAD>";

    // Pagination link area.
    identifers.pageLinkInfoStartIdentifer = "<table><tr><td>分页链接:";
    identifers.pageLinkInfoEndIdentifer = "</td></tr></table>";
    identifers.pageLinkStartIdentifer = "href=";
    identifers.pageLinkEndIdentifer = "><font color=blue>";
    identifers.firstPageIdentifer = "首页";
    identifers.pageIndexStartIdentifer = "<font color=blue>[";
    identifers.pageIndexEndIdentifer = "]</font>";
    identifers.currentPageStartIdentifer = "<font color=black>[";
    identifers.currentPageEndIdentifer = "]</font>";

    // Opening post.
    identifers.topicAuthorStartIdentifier = "/browse/Listwriter.asp?vwriter=";
    identifers.topicAuthorEndIdentifier = "&idwriter=0&key=0";
    identifers.topicContentStartIdentifier = "<DIV class=content ";
    identifers.topicContentEndIdentifier = "</DIV></div><center>";
    identifers.topicSubmitTimeStartIdentifier = "提交日期:";
    identifers.topicSubmitTimeEndIdentifier = "</font>";

    // Replies. The end marker is the same literal as the start marker.
    // NOTE(review): presumably a reply runs until the next reply's table
    // begins - confirm against the reply parser.
    identifers.remarkStartIdentifer = "<TABLE cellspacing=0 border=0";
    identifers.remarkEndIdentifer = "<TABLE cellspacing=0 border=0";
    identifers.remarkAuthorStartIdentifer = "/browse/Listwriter.asp?vwriter=";
    identifers.remarkAuthorEndIdentifer = "&idwriter=0&key=0";
    identifers.remarkContentStartIdentifer = "</TD></TR></table>";
    identifers.remarkTimeStartIdentifer = "回复日期:";
    identifers.remarkTimeEndIdentifer = "</font>";
}

// The page title is parsed as follows.

/**
 * Extracts the page title and stores it in currentTianyaPage.title.
 *
 * Despite the method name, only the title markers are used here; the head
 * markers are not read.
 *
 * currentTianyaPage: receives the extracted title.
 * identifers:        supplies titleStartIdentifier / titleEndIdentifier.
 * pageContentStr:    the page text to search.
 *
 * Throws TianyaPageTitleParseException when the start marker is missing,
 * or when the end marker is missing or follows the start marker directly
 * (i.e. the title would be empty).
 * Throws ArgumentNullException when identifers or pageContentStr is null.
 */
private static void parseTitleAndHeadInfo(ref TianyaPage currentTianyaPage, ref TianyaPageIdentifiers identifers, ref String pageContentStr)
{
    if (identifers == null)
    {
        throw new ArgumentNullException("identifers");
    }
    if (pageContentStr == null)
    {
        throw new ArgumentNullException("pageContentStr");
    }

    // The markers are literal HTML fragments, so they are matched ordinally
    // rather than with the culture-sensitive default of IndexOf(string).
    int titleStartPos = pageContentStr.IndexOf(identifers.titleStartIdentifier, StringComparison.Ordinal);
    if (titleStartPos < 0)
    {
        throw new TianyaPageTitleParseException();
    }

    // Step over the start marker so the position points at the title text.
    titleStartPos += identifers.titleStartIdentifier.Length;

    int titleEndPos = pageContentStr.IndexOf(identifers.titleEndIdentifier, titleStartPos, StringComparison.Ordinal);
    if (titleEndPos <= titleStartPos)
    {
        // Covers both "end marker not found" (-1) and an empty title.
        throw new TianyaPageTitleParseException();
    }

    currentTianyaPage.title = pageContentStr.Substring(titleStartPos, titleEndPos - titleStartPos);
}

// The rest of the page content, such as reply time and reply author, is
// parsed with the same kind of process.