ASP.NET 垂直搜索引擎的实现(2) -- 定时更新索引文件 - CSDN博客

本文介绍如何在ASP.NET应用中实现定时任务，并使用Lucene.NET进行索引更新。通过Global.asax文件中的Application_Start事件触发定时器，每周执行一次索引更新操作。

首先在Global.asax 中编写如下代码：实现一个定时程序。

// Timer that drives the periodic index update. It is kept in a static field
// because a timer referenced only by a local variable becomes eligible for
// garbage collection as soon as Application_Start returns.
private static System.Timers.Timer s_indexTimer;

// Time of day at which the index update runs (00:10:10). The original code
// compared the hour against 24, which DateTime.Hour (0-23) can never equal.
private static readonly TimeSpan IndexTimeOfDay = new TimeSpan(0, 10, 10);

// Number of days between two scheduled index updates (weekly).
private const int IndexPeriodDays = 7;

/// <summary>
/// Runs when the application starts: arms a one-shot timer for the next
/// occurrence of <see cref="IndexTimeOfDay"/>. The timer is re-armed by
/// <see cref="doJob"/> after every run.
/// </summary>
protected void Application_Start(object sender, EventArgs e)
{
    DateTime now = DateTime.Now;
    DateTime firstRun = now.Date.Add(IndexTimeOfDay);
    if (firstRun <= now)
    {
        firstRun = firstRun.AddDays(1);
    }

    System.Timers.Timer timer = new System.Timers.Timer();
    // One-shot: the handler re-arms the timer itself, so a slow indexing run
    // can never overlap with the next tick.
    timer.AutoReset = false;
    // Subscribe before starting so no tick can be missed.
    timer.Elapsed += new System.Timers.ElapsedEventHandler(doJob);
    // Timer.Interval is expressed in MILLISECONDS.
    timer.Interval = MillisecondsUntil(firstRun);

    s_indexTimer = timer;
    timer.Start();
}

/// <summary>
/// Runs when the application shuts down: stops and releases the timer.
/// </summary>
protected void Application_End(object sender, EventArgs e)
{
    System.Timers.Timer timer = s_indexTimer;
    s_indexTimer = null;
    if (timer != null)
    {
        timer.Stop();
        timer.Dispose();
    }
}

/// <summary>
/// Timer callback: rebuilds the search index, then schedules the next run
/// <see cref="IndexPeriodDays"/> days later at <see cref="IndexTimeOfDay"/>.
/// </summary>
/// <remarks>
/// This runs on a thread-pool thread, outside of any HTTP request, so
/// HttpContext.Current is null here.
/// </remarks>
void doJob(object source, System.Timers.ElapsedEventArgs e)
{
    try
    {
        Jebsearch.Code.CreatIndex createindex = new Jebsearch.Code.CreatIndex();
        createindex.SaveCfs();
    }
    catch (Exception ex)
    {
        // A failed run must be recorded instead of vanishing silently,
        // and must not prevent the next run from being scheduled.
        System.Diagnostics.Trace.TraceError("Scheduled index update failed: " + ex);
    }
    finally
    {
        System.Timers.Timer timer = s_indexTimer;
        if (timer != null)
        {
            DateTime nextRun = e.SignalTime.Date.Add(IndexTimeOfDay).AddDays(IndexPeriodDays);
            timer.Interval = MillisecondsUntil(nextRun);
            timer.Start();
        }
    }
}

// Returns the delay from now until 'due' in milliseconds, never less than 1
// because Timer.Interval must be greater than zero.
private static double MillisecondsUntil(DateTime due)
{
    double delay = (due - DateTime.Now).TotalMilliseconds;
    return delay < 1 ? 1 : delay;
}

写索引生成代码：CreatIndex.cs

using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Xml;
using Lucene.Net.Analysis;
using WawaSoft.Search.Common.Analyzer;
using Lucene.Net.Index;
using Lucene.Net.Analysis.KTDictSeg;
using Lucene.Net.Documents;
using System.Text.RegularExpressions;

namespace Jebsearch.Code
{
    /// <summary>
    /// Builds and incrementally updates the Lucene.NET search index from the
    /// "Resource" table, and records the time of the last run in dotime.xml.
    /// </summary>
    public class CreatIndex
    {
        // Virtual path of the XML file holding the last execution time.
        private const string DoTimeFile = "~/_Data/dotime.xml";

        // XPath of the node holding the last execution time.
        private const string DoTimeNode = "//search//adobj//dotime";

        private DateTime _dotime;   // last execution time
        private int _dointerval;    // execution interval, in days
        private int _totoPage;      // total number of pages reported by the last GetData call

        // Per-instance XML state. These used to be static, which made every
        // instance share (and overwrite) the same document.
        private string _strXmlFile;
        private XmlDocument _objXmlDoc = new XmlDocument();

        /// <summary>Total number of pages reported by the last <see cref="GetData"/> call.</summary>
        public int TotoPage
        {
            get { return _totoPage; }
            set { _totoPage = value; }
        }

        /// <summary>Time stored as "last execution time" by <see cref="SvaeDoTime"/>.</summary>
        public DateTime DoTime
        {
            get { return _dotime; }
            set { _dotime = value; }
        }

        /// <summary>Execution interval in days.</summary>
        public int DoInterval
        {
            get { return _dointerval; }
            set { _dointerval = value; }
        }

        /// <summary>
        /// Creates an indexer with a 7-day interval and the current time as
        /// the execution time.
        /// </summary>
        public CreatIndex()
        {
            _dointerval = 7;
            _dotime = DateTime.Now;
        }

        /// <summary>
        /// Loads the XML document at the given physical path.
        /// </summary>
        /// <param name="_XmlFile">Physical path of the XML file.</param>
        public void Load(string _XmlFile)
        {
            // Exceptions propagate unchanged; the former "catch { throw ex; }"
            // only destroyed the original stack trace.
            XmlDocument doc = new XmlDocument();
            doc.Load(_XmlFile);
            _objXmlDoc = doc;
            _strXmlFile = _XmlFile;
        }

        /// <summary>
        /// Replaces the inner text of the node selected by an XPath expression.
        /// </summary>
        /// <param name="_XmlPathNode">XPath of the node to update.</param>
        /// <param name="_Content">New inner text.</param>
        /// <exception cref="InvalidOperationException">No node matches the XPath.</exception>
        public void Replace(string _XmlPathNode, string _Content)
        {
            XmlNode node = _objXmlDoc.SelectSingleNode(_XmlPathNode);
            if (node == null)
            {
                throw new InvalidOperationException("XML node not found: " + _XmlPathNode);
            }
            node.InnerText = _Content;
        }

        /// <summary>
        /// Returns the inner text of the node selected by an XPath expression,
        /// or an empty string when no node matches.
        /// </summary>
        /// <param name="_XmlPathNode">XPath, e.g. Channels/Channel[ID="1"]/Content</param>
        public string GetText(string _XmlPathNode)
        {
            XmlNode node = _objXmlDoc.SelectSingleNode(_XmlPathNode);
            return node != null ? node.InnerText : "";
        }

        /// <summary>
        /// Saves the document back to the file it was loaded from.
        /// </summary>
        public void Save()
        {
            // The document is intentionally kept after saving; it used to be
            // set to null, which made any later GetText/Replace call crash.
            _objXmlDoc.Save(_strXmlFile);
        }

        /// <summary>
        /// Writes <see cref="DoTime"/> to dotime.xml as the last execution time.
        /// </summary>
        public void SvaeDoTime()
        {
            this.Load(MapPath(DoTimeFile));
            this.Replace(DoTimeNode, this.DoTime.ToString());
            this.Save();
        }

        /// <summary>
        /// Reads one page (500 rows) of resources whose Res_Up_Time lies between
        /// the last execution time and now, and updates <see cref="TotoPage"/>.
        /// </summary>
        /// <param name="index">Page index passed to the paging helper.</param>
        /// <returns>The rows of the requested page.</returns>
        public DataTable GetData(int index)
        {
            this.Load(MapPath(DoTimeFile));

            // Start of the window: last execution time. When the file holds no
            // parsable value (e.g. first run) everything is indexed, starting
            // from the smallest value SQL Server's datetime accepts.
            DateTime startTime;
            if (!DateTime.TryParse(this.GetText(DoTimeNode), out startTime))
            {
                startTime = System.Data.SqlTypes.SqlDateTime.MinValue.Value;
            }
            DateTime endTime = DateTime.Now; // end of the window

            // Connection string comes from web.config ("Collect").
            string connectionString = ConfigurationManager.ConnectionStrings["Collect"].ToString();
            Jebsearch.DBUtility.SqlDbOperHandler doh = new Jebsearch.DBUtility.SqlDbOperHandler(new System.Data.SqlClient.SqlConnection(connectionString));
            try
            {
                doh.ConditionExpress = "Res_Up_Time between @starttime and @endtime ";
                doh.AddConditionParameter("@starttime", startTime);
                doh.AddConditionParameter("@endtime", endTime);

                int recount = 0;
                int pagecount = 0;
                int pagesize = 500;
                string sql = doh.GetPageSql("Res_Id,Res_Title,Res_Edition,Res_St_Class,Res_Type,Res_Up_User,Res_IsTop,Res_IsHot,Res_IsBest,Res_Size,Res_HtmlPath,Res_Content,Res_Type,Res_Subject,Res_Point,Res_Up_Time,Res_Format", "Resource", "Res_Id", "order by Res_Id desc", index, pagesize, out recount, out pagecount);
                doh.SqlCmd = sql;
                DataTable dt = doh.GetDataTable();

                this.TotoPage = pagecount;
                return dt;
            }
            finally
            {
                // Release the handler even when the query throws.
                doh.Dispose();
            }
        }

        /// <summary>
        /// Indexes every page of changed resources into the compound-file index
        /// and then records the execution time.
        /// </summary>
        public void SaveCfs()
        {
            DataTable dt = GetData(1);
            gindexs(1, dt);

            // Pages are 1-based and TotoPage is the total page count, so the
            // last page is TotoPage itself ("i < TotoPage" skipped it).
            for (int i = 2; i <= this.TotoPage; i++)
            {
                dt = GetData(i);
                gindexs(1, dt);
            }

            // Record the last execution time.
            SvaeDoTime();
        }

        /// <summary>
        /// Adds the rows of <paramref name="dt"/> to the index. Rows whose
        /// cleaned title or content is 2 characters or shorter are skipped.
        /// </summary>
        /// <param name="itype">1 to delete existing documents with the same id before adding.</param>
        /// <param name="dt">Rows returned by <see cref="GetData"/>.</param>
        public void gindexs(int itype, DataTable dt)
        {
            if (dt == null)
            {
                return;
            }

            string dictPath = MapPath("~/_Data") + @"/";         // dictionary path
            string _indexDirectory = MapPath("~/_index") + @"/"; // index path

            // Chinese dictionary analyzer by default; a simple delimiter-based
            // analyzer for the exact-match fields.
            Analyzer KTDanalyzer = new KTDictSegAnalyzer(dictPath);
            WawaSimpleAnalyzer simpleAnalyzer = new WawaSimpleAnalyzer();
            PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(KTDanalyzer);
            wrapper.AddAnalyzer("id", simpleAnalyzer);
            wrapper.AddAnalyzer("sourceurl", simpleAnalyzer);
            wrapper.AddAnalyzer("time", simpleAnalyzer);

            // 1. Build the documents first so no index lock is held while parsing.
            System.Collections.Generic.List<Document> documents = new System.Collections.Generic.List<Document>();
            System.Collections.Generic.List<string> ids = new System.Collections.Generic.List<string>();
            foreach (DataRow dr in dt.Rows)
            {
                string body = parseHtml(dr["Res_Content"].ToString());
                string title = parseHtml(dr["Res_Title"].ToString());
                if (title.Length > 2 && body.Length > 2)
                {
                    ids.Add(FieldText(dr, "Res_Id", ""));
                    documents.Add(BuildDocument(dr, title, body));
                }
            }

            // 2. Delete stale copies BEFORE opening the writer: a deleting
            //    IndexReader and an IndexWriter both need the index write lock,
            //    so they must not be open on the same directory at once.
            bool indexExists = IndexReader.IndexExists(_indexDirectory);
            if (itype == 1 && indexExists)
            {
                IndexReader reader = IndexReader.Open(_indexDirectory);
                try
                {
                    foreach (string id in ids)
                    {
                        reader.DeleteDocuments(new Term("id", id));
                    }
                }
                finally
                {
                    reader.Close(); // commits the deletions and frees the lock
                }
            }

            // 3. Add the fresh documents.
            IndexWriter _writer = new IndexWriter(_indexDirectory, wrapper, !indexExists);
            try
            {
                _writer.SetUseCompoundFile(true);             // compound (.cfs) index
                _writer.SetMaxFieldLength(int.MaxValue);      // do not truncate long fields
                _writer.SetMergeFactor(dt.Rows.Count + 100);  // merge factor above the batch size
                _writer.SetMaxMergeDocs(10000);               // max documents per merged segment
                _writer.SetMaxBufferedDocs(1000);             // documents buffered in memory before flushing

                foreach (Document document in documents)
                {
                    _writer.AddDocument(document);
                }
                _writer.Optimize();
            }
            finally
            {
                _writer.Close(); // always release the write lock
            }
        }

        // Builds the Lucene document for one resource row.
        private static Document BuildDocument(DataRow dr, string title, string body)
        {
            Document document = new Document();
            document.Add(new Field("id", FieldText(dr, "Res_Id", ""), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
            document.Add(new Field("content", body, Field.Store.YES, Field.Index.TOKENIZED));
            document.Add(new Field("sourceurl", FieldText(dr, "Res_HtmlPath", ""), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("time", ShortDateText(dr["Res_Up_Time"]), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("subject", FieldText(dr, "Res_Subject", "0"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("edition", FieldText(dr, "Res_Edition", "1"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("class", FieldText(dr, "Res_St_Class", "0"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("type", FieldText(dr, "Res_Type", "0"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("point", FieldText(dr, "Res_Point", "0"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            // Field names "host" and "formt" are kept as published so that
            // existing indexes and search code keep working.
            document.Add(new Field("host", FieldText(dr, "Res_IsHot", "false"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("best", FieldText(dr, "Res_IsBest", "false"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("top", FieldText(dr, "Res_IsTop", "false"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("formt", FieldText(dr, "Res_Format", ""), Field.Store.YES, Field.Index.UN_TOKENIZED));
            document.Add(new Field("size", FieldText(dr, "Res_Size", "0KB"), Field.Store.YES, Field.Index.UN_TOKENIZED));
            // The author fallback used to be "0KB", copied from the size field.
            document.Add(new Field("author", FieldText(dr, "Res_Up_User", ""), Field.Store.YES, Field.Index.UN_TOKENIZED));
            return document;
        }

        // Returns the column value as text, or 'fallback' when the column is
        // NULL. (DBNull.ToString() yields "", never null, so the former
        // "ToString() ?? fallback" could not apply its fallback.)
        private static string FieldText(DataRow row, string column, string fallback)
        {
            object value = row[column];
            if (value == null || value == DBNull.Value)
            {
                return fallback;
            }
            return value.ToString();
        }

        // Formats a date column as a short date string, or "" when the value
        // is NULL or not a date.
        private static string ShortDateText(object value)
        {
            if (value is DateTime)
            {
                return ((DateTime)value).ToShortDateString();
            }

            DateTime parsed;
            if (value != null && value != DBNull.Value && DateTime.TryParse(value.ToString(), out parsed))
            {
                return parsed.ToShortDateString();
            }
            return "";
        }

        // Maps an application-relative virtual path to a physical path. Works
        // inside a request and also on background threads (e.g. a timer
        // callback), where HttpContext.Current is null.
        private static string MapPath(string virtualPath)
        {
            HttpContext context = HttpContext.Current;
            if (context != null)
            {
                return context.Server.MapPath(virtualPath);
            }
            return System.Web.Hosting.HostingEnvironment.MapPath(virtualPath);
        }

        #region HTML clean-up

        /// <summary>
        /// Strips style blocks and a few opening tags from an HTML fragment so
        /// that it can be indexed as text.
        /// </summary>
        private string parseHtml(string html)
        {
            // Singleline lets ".*?" span the line breaks inside a <style> block.
            html = Regex.Replace(html, @"<style[^>]*?>.*?</style>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
            // A leading image is replaced by a placeholder.
            html = Regex.Replace(html, @"^<img\s+[^>]*>", "略", RegexOptions.IgnoreCase);
            html = Regex.Replace(html, @"<p.*?>", "", RegexOptions.IgnoreCase);
            html = Regex.Replace(html, @"<span.*?>", "", RegexOptions.IgnoreCase);
            html = Regex.Replace(html, @"<b.*?>", "", RegexOptions.IgnoreCase);
            html = Regex.Replace(html, @"<strong.*?>", "", RegexOptions.IgnoreCase);
            html = Regex.Replace(html, @"<font.*?>", "", RegexOptions.IgnoreCase);
            // NOTE(review): the published listing showed a space-for-space
            // replacement here, most likely an HTML-decoded "&nbsp;" - confirm.
            return html.Replace("&nbsp;", " ").Replace('\u00A0', ' ');
        }

        /// <summary>
        /// Rewrites relative <c>src</c> attributes to absolute URLs and forces a
        /// 500x300 inline style on the affected tags.
        /// </summary>
        /// <param name="text">HTML to rewrite.</param>
        /// <param name="absoluteUrl">
        /// Base URL prepended to relative paths.
        /// NOTE(review): presumably expected to end with "/", otherwise the
        /// final double-slash clean-up also removes legitimate slashes - confirm.
        /// </param>
        /// <returns>The rewritten HTML.</returns>
        public String ConvertRelativePathsToAbsolute(String text, String absoluteUrl)
        {
            // "$" has a special meaning in a replacement pattern; escape it so
            // the URL is inserted literally.
            String literalUrl = absoluteUrl.Replace("$", "$$");
            String value = Regex.Replace(text, "<(.*?)(src)=\"(?!http)(.*?)\"(.*?)>", "<$1$2=\"" + literalUrl + "$3\"$4 style='width:500px;height:300px;'>", RegexOptions.IgnoreCase | RegexOptions.Multiline);

            // If the original relative path started with a "/", the replacement
            // above produced a double slash after the base URL; collapse it.
            return value.Replace(absoluteUrl + "/", absoluteUrl);
        }

        #endregion
    }
}

源代码下载地址：下载

分享到：