java中文分词算法

本文介绍了一种基于Lucene的高效搜索方案,通过对比LIKE关键字的局限性,阐述了分词技术的重要性,并提供了详细的Java实现代码。文章还介绍了如何进行高亮显示、前缀查询等高级搜索操作。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

我想只要是学过数据库的孩纸,不管是MySQL,还是SQL Server,一提到查找,本能想到的便是like关键字,其实去转盘网(分类模式)之前也是采用这种算法,但我可以告诉大家一个很不幸的事情,like匹配其实会浪费大量的有用资源,原因这里不说了请自己想一想,我们还是直接摆事实验证。

   现在用去转盘网搜:hello 这个单词,如下:

   http://www.quzhuanpan.com/source/search.action?q=hello&currentPage=1

翻页你会发现只要是包含hello的单词都找到了,但是如果你用like的话是不会有这个效果的,不信让我们再看一下,还好他来说电影网的分词算法我还没来得及修改,还可以看到现象:

   http://www.talaishuo.com/searchResult.do?searchFileName=hello

你会发现只有以hello开头的搜索串才能得到匹配,这样问题就来了,数据库中大量的资源岂不是白白浪费了,不过没事,伟大的人类还是很聪明的,发明了分词,分词的原理我就不讲了,请自己百度吧,还是直接上代码,提示,这里需要四个jar包作为工具,我先上传到了去转盘,想要做分词的请先下载:

   分词包下载地址1

   分词包下载地址2

另外附上2篇有用的博客:百度网盘爬虫 百度图片爬虫

[java]  view plain  copy
  1. package com.tray.indexData;  
  2. import java.io.File;  
  3. import java.io.IOException;  
  4. import java.io.StringReader;  
  5. import java.math.BigInteger;  
  6. import java.util.ArrayList;  
  7. import java.util.HashMap;  
  8. import java.util.List;  
  9. import java.util.Map;  
  10.    
  11. import org.apache.lucene.analysis.Analyzer;  
  12. import org.apache.lucene.analysis.TokenStream;  
  13. import org.apache.lucene.document.Document;  
  14. import org.apache.lucene.document.Fieldable;  
  15. import org.apache.lucene.index.CorruptIndexException;  
  16. import org.apache.lucene.index.IndexReader;  
  17. import org.apache.lucene.index.IndexWriter;  
  18. import org.apache.lucene.index.IndexWriterConfig;  
  19. import org.apache.lucene.index.IndexWriterConfig.OpenMode;  
  20. import org.apache.lucene.index.Term;  
  21. import org.apache.lucene.queryParser.MultiFieldQueryParser;  
  22. import org.apache.lucene.queryParser.QueryParser;  
  23. import org.apache.lucene.search.IndexSearcher;  
  24. import org.apache.lucene.search.PrefixQuery;  
  25. import org.apache.lucene.search.Query;  
  26. import org.apache.lucene.search.ScoreDoc;  
  27. import org.apache.lucene.search.TermQuery;  
  28. import org.apache.lucene.search.TopDocs;  
  29. import org.apache.lucene.search.TopScoreDocCollector;  
  30. import org.apache.lucene.search.WildcardQuery;  
  31. import org.apache.lucene.search.highlight.Highlighter;  
  32. import org.apache.lucene.search.highlight.QueryScorer;  
  33. import org.apache.lucene.search.highlight.SimpleHTMLFormatter;  
  34. import org.apache.lucene.store.Directory;  
  35. import org.apache.lucene.store.FSDirectory;  
  36. import org.apache.lucene.util.Version;  
  37. import org.wltea.analyzer.lucene.IKAnalyzer;  
  38.    
  39. import com.tray.bean.SerachResult;  
  40. import com.tray.common.tools.DateFormater;  
  41.    
  42. public class LuceneSearch {  
  43.        
  44.     private static String DISC_URL = "/home/indexData/data";  
  45.        
  46.     static {  
  47.         String os = System.getProperty("os.name");    
  48.         if(os.toLowerCase().startsWith("win")){    
  49.             DISC_URL = "E:\\indexData\\data";   
  50.         }  
  51.         else{  
  52.             DISC_URL ="/home/indexData/data";  
  53.         }  
  54.     }  
  55.            
  56.     //指定分词器   
  57.     private Analyzer analyzer=new IKAnalyzer();   
  58.     private static Directory directory;  
  59.     //配置  
  60.     private static IndexWriterConfig iwConfig;  
  61.     //配置IndexWriter  
  62.     private static IndexWriter writer;    
  63.     private static File indexFile = null;    
  64.        
  65.     private static Version version = Version.LUCENE_36;  
  66.        
  67.     private final int PAPGESIZE=10;  
  68.    
  69.     /** 
  70.      * 全量索引 
  71.      * @Author haoning 
  72.      */  
  73.     public void init() throws Exception {  
  74.            
  75.         try {  
  76.             indexFile = new File(DISC_URL);  
  77.             if (!indexFile.exists()) {  
  78.                 indexFile.mkdir();  
  79.             }  
  80.             directory=FSDirectory.open(indexFile);    
  81.             //配置IndexWriterConfig    
  82.             iwConfig = new IndexWriterConfig(version,analyzer);    
  83.             iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);    
  84.                 //创建写索引对象    
  85.             writer = new IndexWriter(directory,iwConfig);     
  86.         } catch (Exception e) {  
  87.         }  
  88.     }  
  89.        
  90.     public void closeWriter(){  
  91.         try {  
  92.             writer.close();  
  93.         } catch (CorruptIndexException e) {  
  94.             e.printStackTrace();  
  95.         } catch (IOException e) {  
  96.             e.printStackTrace();  
  97.         }  
  98.     }  
  99.        
  100.     public void commit(){  
  101.            
  102.         try {  
  103.             writer.commit();  
  104.         } catch (CorruptIndexException e) {  
  105.             e.printStackTrace();  
  106.         } catch (IOException e) {  
  107.             e.printStackTrace();  
  108.         }  
  109.     }  
  110.        
  111.     /** 
  112.      * 一个一个索引 
  113.      * @Author haoning 
  114.      */  
  115.     public void singleIndex(Document doc) throws Exception {  
  116.         writer.addDocument(doc);  
  117.     }  
  118.        
  119.     /** 
  120.      * 一个跟新 
  121.      * @Author haoning 
  122.      */  
  123.     public void singleUpdate(Document doc) throws Exception {  
  124.         Term term = new Term("url", doc.get("url"));  
  125.         writer.updateDocument(term,doc);  
  126.     }  
  127.        
  128.     /** 
  129.      * 全量索引 
  130.      * @Author haoning 
  131.      */  
  132.     public void fullIndex(Document[] documentes) throws Exception {  
  133.            
  134.         writer.deleteAll();  
  135.         for (Document document : documentes) {  
  136.             writer.addDocument(document);  
  137.         }  
  138.         writer.commit();  
  139.     }  
  140.        
  141.     /** 
  142.      * 根据id删除索引 
  143.      * @Author haoning 
  144.      */  
  145.     public void deleteIndex(Document document)throws Exception{  
  146.         Term term = new Term("url", document.get("url"));//url才是唯一标志  
  147.         writer.deleteDocuments(term);  
  148.         writer.commit();  
  149.     }  
  150.        
  151.     /** 
  152.      * 根据id增量索引 
  153.      * @Author haoning 
  154.      */  
  155.     public void updateIndex(Document[] documentes) throws Exception{  
  156.         for (Document document : documentes) {  
  157.             Term term = new Term("url", document.get("url"));  
  158.             writer.updateDocument(term, document);  
  159.         }  
  160.         writer.commit();  
  161.     }  
  162.        
  163.     /** 
  164.      * 直接查询 
  165.      * @Author haoning 
  166.      */  
  167.     public void simpleSearch(String filedStr,String queryStr,int page, int pageSize) throws Exception{  
  168.         File indexDir = new File(DISC_URL);    
  169.         //索引目录    
  170.         Directory dir=FSDirectory.open(indexDir);    
  171.         //根据索引目录创建读索引对象    
  172.         IndexReader reader = IndexReader.open(dir);    
  173.         //搜索对象创建    
  174.         IndexSearcher searcher = new IndexSearcher(reader);  
  175.         TopScoreDocCollector topCollector = TopScoreDocCollector.create(searcher.maxDoc(), false);  
  176.            
  177.         Term term = new Term(filedStr, queryStr);  
  178.         Query query = new TermQuery(term);  
  179.         searcher.search(query, topCollector);  
  180.         ScoreDoc[] docs = topCollector.topDocs((page-1)*pageSize, pageSize).scoreDocs;  
  181.            
  182.         printScoreDoc(docs, searcher);  
  183.     }  
  184.        
  185.     /** 
  186.      * 高亮查询 
  187.      * @Author haoning 
  188.      */  
  189.     public Map<String, Object> highLightSearch(String filed,String keyWord,int curpage, int pageSize) throws Exception{  
  190.         List<SerachResult> list=new ArrayList<SerachResult>();  
  191.         Map<String,Object> map = new HashMap<String,Object>();  
  192.         if (curpage <= 0) {  
  193.             curpage = 1;  
  194.         }  
  195.         if (pageSize <= 0 || pageSize>20) {  
  196.              pageSize = PAPGESIZE;  
  197.         }  
  198.         File indexDir = new File(DISC_URL); //索引目录     
  199.         Directory dir=FSDirectory.open(indexDir);//根据索引目录创建读索引对象      
  200.         IndexReader reader = IndexReader.open(dir);//搜索对象创建      
  201.         IndexSearcher searcher = new IndexSearcher(reader);  
  202.            
  203.         int start = (curpage - 1) * pageSize;  
  204.            
  205.         Analyzer analyzer = new IKAnalyzer(true);  
  206.         QueryParser queryParser = new QueryParser(Version.LUCENE_36, filed, analyzer);  
  207.         queryParser.setDefaultOperator(QueryParser.AND_OPERATOR);  
  208.         Query query = queryParser.parse(keyWord);  
  209.            
  210.         int hm = start + pageSize;  
  211.         TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);  
  212.         searcher.search(query, res);  
  213.            
  214.         SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>""</span>");  
  215.         Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));  
  216.            
  217.         long amount = res.getTotalHits();  
  218.         //long pages = (rowCount - 1) / pageSize + 1; //计算总页数  
  219.            
  220.         map.put("amount",amount);//总共多少条记录  
  221.            
  222.         TopDocs tds = res.topDocs(start, pageSize);  
  223.         ScoreDoc[] sd = tds.scoreDocs;  
  224.            
  225.         for (int i = 0; i < sd.length; i++) {  
  226.             Document doc = searcher.doc(sd[i].doc);  
  227.             String temp=doc.get("name");  
  228.             //做高亮处理  
  229.             TokenStream ts = analyzer.tokenStream("name"new StringReader(temp));  
  230.                
  231.             SerachResult record=new SerachResult();  
  232.             String name = highlighter.getBestFragment(ts,temp);   
  233.             String skydirverName=doc.get("skydirverName");  
  234.             String username=doc.get("username");  
  235.             String shareTime=doc.get("shareTime");  
  236.             String describ=doc.get("describ");  
  237.             String typeId=doc.get("typeId");  
  238.             String id=doc.get("id");  
  239.             String url=doc.get("url");  
  240.                
  241.             record.setName(name);  
  242.             record.setSkydriverName(skydirverName);  
  243.             record.setUsername(username);  
  244.             record.setShareTime(DateFormater.getFormatDate(shareTime,"yyyy-MM-dd HH:mm:ss"));  
  245.             record.setDescrib(describ);  
  246.             record.setTypeId(Integer.parseInt(typeId));  
  247.             record.setId(new BigInteger(id));  
  248.             record.setUrl(url);  
  249.             list.add(record);  
  250.                
  251.             /*System.out.println("name:"+name); 
  252.             System.out.println("skydirverName:"+skydirverName); 
  253.             System.out.println("username:"+username); 
  254.             System.out.println("shareTime:"+shareTime); 
  255.             System.out.println("describ:"+describ); 
  256.             System.out.println("typeId:"+typeId); 
  257.             System.out.println("id:"+id); 
  258.             System.out.println("url:"+url);*/  
  259.         }  
  260.         map.put("source",list);  
  261.         return map;  
  262.     }  
  263.        
  264.     /** 
  265.      * 根据前缀查询 
  266.      * @Author haoning 
  267.      */  
  268.     public void prefixSearch(String filedStr,String queryStr) throws Exception{  
  269.         File indexDir = new File(DISC_URL);    
  270.         //索引目录    
  271.         Directory dir=FSDirectory.open(indexDir);    
  272.         //根据索引目录创建读索引对象    
  273.         IndexReader reader = IndexReader.open(dir);    
  274.         //搜索对象创建    
  275.         IndexSearcher searcher = new IndexSearcher(reader);  
  276.            
  277.         Term term = new Term(filedStr, queryStr);  
  278.         Query query = new PrefixQuery(term);  
  279.            
  280.         ScoreDoc[] docs = searcher.search(query, 3).scoreDocs;  
  281.         printScoreDoc(docs, searcher);  
  282.     }  
  283.        
  284.     /** 
  285.      * 通配符查询 
  286.      * @Author haoning 
  287.      */  
  288.     public void wildcardSearch(String filedStr,String queryStr) throws Exception{  
  289.         File indexDir = new File(DISC_URL);    
  290.         //索引目录    
  291.         Directory dir=FSDirectory.open(indexDir);    
  292.         //根据索引目录创建读索引对象    
  293.         IndexReader reader = IndexReader.open(dir);    
  294.         //搜索对象创建    
  295.         IndexSearcher searcher = new IndexSearcher(reader);  
  296.            
  297.         Term term = new Term(filedStr, queryStr);  
  298.         Query query = new WildcardQuery(term);  
  299.         ScoreDoc[] docs = searcher.search(query, 3).scoreDocs;  
  300.         printScoreDoc(docs, searcher);  
  301.     }  
  302.        
  303.     /** 
  304.      * 分词查询 
  305.      * @Author haoning 
  306.      */  
  307.     public void analyzerSearch(String filedStr,String queryStr) throws Exception{  
  308.         File indexDir = new File(DISC_URL);    
  309.         //索引目录    
  310.         Directory dir=FSDirectory.open(indexDir);    
  311.         //根据索引目录创建读索引对象    
  312.         IndexReader reader = IndexReader.open(dir);    
  313.         //搜索对象创建    
  314.         IndexSearcher searcher = new IndexSearcher(reader);  
  315.            
  316.         QueryParser queryParser = new QueryParser(version, filedStr, analyzer);  
  317.         Query query = queryParser.parse(queryStr);  
  318.            
  319.         ScoreDoc[] docs = searcher.search(query, 3).scoreDocs;  
  320.         printScoreDoc(docs, searcher);  
  321.     }  
  322.        
  323.     /** 
  324.      * 多属性分词查询 
  325.      * @Author haoning 
  326.      */  
  327.     public void multiAnalyzerSearch(String[] filedStr,String queryStr) throws Exception{  
  328.         File indexDir = new File(DISC_URL);    
  329.         //索引目录    
  330.         Directory dir=FSDirectory.open(indexDir);    
  331.         //根据索引目录创建读索引对象    
  332.         IndexReader reader = IndexReader.open(dir);    
  333.         //搜索对象创建    
  334.         IndexSearcher searcher = new IndexSearcher(reader);  
  335.         QueryParser queryParser = new MultiFieldQueryParser(version, filedStr, analyzer);  
  336.         Query query = queryParser.parse(queryStr);  
  337.            
  338.         ScoreDoc[] docs = searcher.search(query, 3).scoreDocs;  
  339.         printScoreDoc(docs, searcher);  
  340.     }  
  341.        
  342.     public void printScoreDoc(ScoreDoc[] docs,IndexSearcher searcher)throws Exception{  
  343.         for (int i = 0; i < docs.length; i++) {  
  344.             List<Fieldable> list = searcher.doc(docs[i].doc).getFields();  
  345.             for (Fieldable fieldable : list) {  
  346.                 String fieldName = fieldable.name();  
  347.                 String fieldValue = fieldable.stringValue();  
  348.                 System.out.println(fieldName+" : "+fieldValue);  
  349.             }  
  350.         }  
  351.     }  
  352. }  

注意由于去转盘网(http://www.quzhuanpan.com)是部署到linux上的,所以DISC_URL可以根据系统变换,我是通过url来判定索引文件是否唯一的,你可以根据id来判断,具体情况具体对待吧。

下面是索引部分:

[java]  view plain  copy
  1. package com.tray.indexData;  
  2.    
  3. import java.sql.SQLException;  
  4. import org.apache.lucene.document.Document;  
  5. import org.apache.lucene.document.Field;  
  6. import com.mysql.jdbc.Connection;  
  7. import com.mysql.jdbc.ResultSet;  
  8. import com.mysql.jdbc.Statement;  
  9.    
  10. public class IndexFile {  
  11.        
  12.      private static Connection conn = null;       
  13.      private static Statement stmt = null;    
  14.      private final int NUM=500000;  
  15.      private LuceneSearch ls;  
  16.      private long count=0;  
  17.         
  18.      public ResultSet deal6SourceTable(String tableName) throws SQLException{  
  19.            String sql = "SELECT distinct `NAME`,SKYDRIVER_NAME,USERNAME,SHARE_TIME,DESCRIB,TYPE_ID,ID,URL FROM "+tableName+" where STATUS=1 and TYPE_ID !='-1' and (TYPE_NAME is null or TYPE_NAME!=1) limit "+NUM;  
  20.            //System.out.println(sql);  
  21.            ResultSet rs = (ResultSet) stmt.executeQuery(sql);  
  22.            return rs;  
  23.      }  
  24.         
  25.      public void update6SourceTable(String tableName) throws SQLException{  
  26.            Statement st = (Statement) conn.createStatement();  
  27.            String sql = "update "+tableName+" set TYPE_NAME=1 where STATUS=1 and TYPE_ID !='-1' and (TYPE_NAME is null or TYPE_NAME!=1) limit "+NUM;  
  28.            //System.out.println("update"+sql);  
  29.             try {  
  30.                 st.executeUpdate(sql);  
  31.             } catch (SQLException e) {  
  32.                 e.printStackTrace();  
  33.             }  
  34.      }  
  35.         
  36.      public void indexInit(){//数据库+lcene初始化  
  37.         conn = (Connection) JdbcUtil.getConnection();       
  38.         if(conn == null) {       
  39.             try {  
  40.                 throw new Exception("数据库连接失败!");  
  41.             } catch (Exception e) {  
  42.                 e.printStackTrace();  
  43.             }       
  44.         }  
  45.         ls=new LuceneSearch();  
  46.         try {  
  47.             ls.init();  
  48.         } catch (Exception e2) {  
  49.             e2.printStackTrace();  
  50.         }  
  51.      }  
  52.         
  53.      public void indexEnd(){//数据库+lcene关闭  
  54.             
  55.          ls.closeWriter();  
  56.          try {  
  57.                 conn.close();//关闭数据库  
  58.              } catch (SQLException e) {  
  59.                 e.printStackTrace();  
  60.           }  
  61.      }  
  62.         
  63.      public void Index6Data() throws SQLException{     
  64.             try {  
  65.                 stmt = (Statement) conn.createStatement();  
  66.             } catch (SQLException e1) {  
  67.                 e1.printStackTrace();  
  68.             }  
  69.                
  70.             ResultSet r1=null;  
  71.             ResultSet r2=null;  
  72.             ResultSet r3=null;  
  73.             ResultSet r4=null;  
  74.             ResultSet r5=null;  
  75.             ResultSet r6=null;  
  76.                
  77.             boolean stop=false;  
  78.             do{  
  79.                  r1=deal6SourceTable("film_and_tv_info");  
  80.                  stop=this.createIndex(r1,ls,"1");   //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引   
  81.                  if(!stop){  
  82.                      ls.commit();//加个判断条件  
  83.                  }  
  84.                  //System.out.println("stop"+stop);  
  85.                    
  86.             }while(!stop);  
  87.               
  88.             stop=false;  
  89.             do{  
  90.                  r2=deal6SourceTable("music_and_mv_info");  
  91.                  stop=this.createIndex(r2,ls,"2");   //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引    
  92.                  if(!stop){  
  93.                      ls.commit();//加个判断条件  
  94.                  }  
  95.                    
  96.             }while(!stop);  
  97.                
  98.             stop=false;  
  99.             do{  
  100.                  r3=deal6SourceTable("e_book_info");  
  101.                  stop=this.createIndex(r3,ls,"3");   //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引    
  102.                  if(!stop){  
  103.                      ls.commit();//加个判断条件  
  104.                  }  
  105.                    
  106.             }while(!stop);  
  107.                
  108.             stop=false;  
  109.             do{  
  110.                  r4=deal6SourceTable("bt_file_info");  
  111.                  stop=this.createIndex(r4,ls,"4");   //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引    
  112.                  if(!stop){  
  113.                      ls.commit();//加个判断条件  
  114.                  }  
  115.                    
  116.             }while(!stop);  
  117.                
  118.             stop=false;  
  119.             do{  
  120.                  r5=deal6SourceTable("characteristic_software_info");  
  121.                  stop=this.createIndex(r5,ls,"5");   //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引    
  122.                  if(!stop){  
  123.                      ls.commit();//加个判断条件  
  124.                  }  
  125.                    
  126.             }while(!stop);  
  127.                
  128.             stop=false;  
  129.             do{  
  130.                  r6=deal6SourceTable("source_code_info");  
  131.                  stop=this.createIndex(r6,ls,"6");   //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引    
  132.                  if(!stop){  
  133.                      ls.commit();//加个判断条件  
  134.                  }  
  135.                    
  136.             }while(!stop);  
  137.             stop=false;  
  138.               
  139.      }  
  140.         
  141.      public ResultSet deal2Share(String tableName) throws SQLException{  
  142.         String sql = "SELECT  distinct NAME,SKYDRIVER_NAME,USERNAME,SHARE_TIME,DESCRIB,TYPE_ID,ID,SHORTURL from "+tableName+" where STATUS=1  and FS_ID ='1' limit "+NUM; //利用FS_ID这个字段,没什么用处   
  143.         ResultSet rs = (ResultSet) stmt.executeQuery(sql);  
  144.         return rs;  
  145.     }  
  146.        
  147.     public ResultSet deal3Share(String tableName) throws SQLException{  
  148.         String sql = "SELECT  distinct title,channel,uid,ctime,description,port,id,shorturl from "+tableName+" where name ='1' limit "+NUM;    
  149.         ResultSet rs = (ResultSet) stmt.executeQuery(sql);  
  150.         return rs;  
  151.     }  
  152.        
  153.     public void Index3Data() throws SQLException{  
  154.             try {  
  155.                 stmt = (Statement) conn.createStatement();  
  156.             } catch (SQLException e1) {  
  157.                 e1.printStackTrace();  
  158.             }  
  159.                
  160.             ResultSet r1=null;  
  161.             ResultSet r2=null;  
  162.             ResultSet r3=null;  
  163.                
  164.             boolean stop=false;  
  165.             do{  
  166.                  r1=deal2Share("share1");  
  167.                  stop=this.createIndex(r1,ls,"7");   //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引    
  168.                  if(!stop){  
  169.                      ls.commit();//加个判断条件  
  170.                  }  
  171.                  //System.out.println("stop"+stop);  
  172.                    
  173.             }while(!stop);  
  174.               
  175.             stop=false;  
  176.             do{  
  177.                  r2=deal2Share("share2");  
  178.                  stop=this.createIndex(r2,ls,"8");   //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引    
  179.                  if(!stop){  
  180.                      ls.commit();//加个判断条件  
  181.                  }  
  182.                    
  183.             }while(!stop);  
  184.                
  185.             stop=false;  
  186.             do{  
  187.                  r3=deal3Share("share3");  
  188.                  stop=this.createIndex(r3,ls,"9");   //给数据库创建索引,此处执行一次,不要每次运行都创建索引,以后数据有更新可以后台调用更新索引    
  189.                  if(!stop){  
  190.                      ls.commit();//加个判断条件  
  191.                  }  
  192.                    
  193.             }while(!stop);  
  194.             stop=false;  
  195.         }  
  196.        
  197.         public void update2ShareTable(String tableName) throws SQLException{  
  198.             Statement st = (Statement) conn.createStatement();  
  199.            String sql = "update "+tableName+" set FS_ID=0 where STATUS=1  and FS_ID ='1' limit "+NUM; //利用FS_ID这个字段,没什么用处   
  200.            //System.out.println("update"+sql);  
  201.             try {  
  202.                 st.executeUpdate(sql);  
  203.             } catch (SQLException e) {  
  204.                 e.printStackTrace();  
  205.             }  
  206.         }  
  207.            
  208.         public void update3ShareTable(String tableName) throws SQLException{  
  209.             Statement st = (Statement) conn.createStatement();  
  210.            String sql = "update "+tableName+" set name=0 where name ='1' limit "+NUM;    
  211.            //System.out.println("update"+sql);  
  212.             try {  
  213.                 st.executeUpdate(sql);  
  214.             } catch (SQLException e) {  
  215.                 e.printStackTrace();  
  216.             }  
  217.         }  
  218.               
  219.         public boolean createIndex(ResultSet rs,LuceneSearch ls,String mark) {  
  220.             try {  
  221.                 String tableName=null;  
  222.                 if(mark.equals("1")){  
  223.                     tableName="film_and_tv_info";  
  224.                 }  
  225.                 if(mark.equals("2")){  
  226.                     tableName="music_and_mv_info";  
  227.                 }  
  228.                 if(mark.equals("3")){  
  229.                     tableName="e_book_info";  
  230.                 }  
  231.                 if(mark.equals("4")){  
  232.                     tableName="bt_file_info";  
  233.                 }  
  234.                 if(mark.equals("5")){  
  235.                     tableName="characteristic_software_info";  
  236.                 }  
  237.                 if(mark.equals("6")){  
  238.                     tableName="source_code_info";  
  239.                 }  
  240.                 if(mark.equals("7")){  
  241.                     tableName="share1";  
  242.                 }  
  243.                 if(mark.equals("8")){  
  244.                     tableName="share2";  
  245.                 }  
  246.                 if(mark.equals("9")){  
  247.                     tableName="share3";  
  248.                 }  
  249.    
  250.                 boolean isNull=rs.next();  
  251.                 //System.out.println("hehe"+isNull);  
  252.                 if(isNull==false){  
  253.                     return true;//处理完毕  
  254.                 }  
  255.                 while(isNull){  
  256.                     if(Integer.parseInt(mark)>=1&&Integer.parseInt(mark)<=8){  
  257.                         Document doc = new Document();    
  258.                         //System.out.println("name"+rs.getString("NAME"));          
  259.                         Field name = new Field("name",rs.getString("NAME"),Field.Store.YES,Field.Index.ANALYZED);  
  260.                         String skName=rs.getString("SKYDRIVER_NAME");  
  261.                         if(skName==null){  
  262.                             skName="百度";  
  263.                         }  
  264.                         Field skydirverName = new Field("skydirverName",skName, Field.Store.YES,Field.Index.NOT_ANALYZED);  
  265.                         Field username = new Field("username",rs.getString("USERNAME"),Field.Store.YES, Field.Index.ANALYZED);      
  266.                         Field shareTime = new Field("shareTime",rs.getString("SHARE_TIME"), Field.Store.YES,Field.Index.NOT_ANALYZED);  
  267.                         String desb=rs.getString("DESCRIB");  
  268.                         if(desb==null){  
  269.                             desb="-1";  
  270.                         }  
  271.                         Field describ = new Field("describ",desb,Field.Store.NO,Field.Index.NOT_ANALYZED);       
  272.                         Field typeId = new Field("typeId",rs.getString("TYPE_ID"), Field.Store.YES,Field.Index.NOT_ANALYZED);   
  273.                         Field id = new Field("id",rs.getString("ID"),Field.Store.YES,Field.Index.NOT_ANALYZED);  
  274.                         Field url =null;  
  275.                         if(Integer.parseInt(mark)>=7&&Integer.parseInt(mark)<=8){  
  276.                             url = new Field("url",rs.getString("SHORTURL"), Field.Store.YES,Field.Index.ANALYZED);   
  277.                         }  
  278.                         else{  
  279.                             url = new Field("url",rs.getString("URL"), Field.Store.YES,Field.Index.ANALYZED);    
  280.                         }  
  281.                         doc.add(name);  
  282.                         doc.add(skydirverName);  
  283.                         doc.add(username);  
  284.                         doc.add(shareTime);  
  285.                         doc.add(describ);  
  286.                         doc.add(typeId);  
  287.                         doc.add(id);  
  288.                         doc.add(url);  
  289.                         ls.singleUpdate(doc);//用跟新更为合适       
  290.                         isNull=rs.next();  
  291.                     }  
  292.                     else{  
  293.                         Document doc = new Document();    
  294.                         //System.out.println("title"+rs.getString("title"));          
  295.                         Field name = new Field("name",rs.getString("title"),Field.Store.YES,Field.Index.ANALYZED);  
  296.                         String skName=rs.getString("channel");  
  297.                         Field skydirverName = new Field("skydirverName",skName, Field.Store.YES,Field.Index.NOT_ANALYZED);  
  298.                         Field username = new Field("username",rs.getString("uid"),Field.Store.YES, Field.Index.ANALYZED);       
  299.                         Field shareTime = new Field("shareTime",rs.getString("ctime"), Field.Store.YES,Field.Index.NOT_ANALYZED);  
  300.                         String desb=rs.getString("description");  
  301.                         if(desb==null){  
  302.                             desb="-1";  
  303.                         }  
  304.                         Field describ = new Field("describ",desb,Field.Store.NO,Field.Index.NOT_ANALYZED);       
  305.                         Field typeId = new Field("typeId",rs.getString("port"), Field.Store.YES,Field.Index.NOT_ANALYZED);  
  306.                         Field id = new Field("id",rs.getString("id"),Field.Store.YES,Field.Index.NOT_ANALYZED);      
  307.                         Field url = new Field("url",rs.getString("shorturl"), Field.Store.YES,Field.Index.ANALYZED);    
  308.                            
  309.                         doc.add(name);  
  310.                         doc.add(skydirverName);  
  311.                         doc.add(username);  
  312.                         doc.add(shareTime);  
  313.                         doc.add(describ);  
  314.                         doc.add(typeId);  
  315.                         doc.add(id);  
  316.                         doc.add(url);  
  317.                         ls.singleUpdate(doc);//用跟新更为合适       
  318.                         isNull=rs.next();  
  319.                     }  
  320.                     count=count+1;  
  321.                 }  
  322.                 if(Integer.parseInt(mark)>=1&&Integer.parseInt(mark)<=6){  
  323.                     update6SourceTable(tableName);//处理完成后做标志  
  324.                 }  
  325.                 else if(Integer.parseInt(mark)>=7&&Integer.parseInt(mark)<=8){  
  326.                     update2ShareTable(tableName);//处理完成后做标志  
  327.                 }  
  328.                 else{  
  329.                     update3ShareTable(tableName);//处理完成后做标志  
  330.                 }  
  331.                 System.out.println("Has index "+count+"条数据,数据来自表"+tableName);  
  332.                    
  333.             } catch (Exception e) {  
  334.                 e.printStackTrace();  
  335.             }  
  336.             return false;  
  337.         }  
  338. }  

数据库之类的请不要关心,看思路即可,你如果需要换成你的即可,这里就不多说了。看最后的部分:

[java]  view plain  copy
  1. package com.tray.indexData;  
  2.    
  3. import java.sql.SQLException;  
  4.    
  5.    
  6.    
  7. public class Application {  
  8.        
  9.     public static void main(String[] args){  
  10.         /*IndexFile indexFile=new IndexFile(); 
  11.         indexFile.indexInit(); 
  12.         try { 
  13.             indexFile.Index6Data(); 
  14.         } catch (SQLException e1) { 
  15.             e1.printStackTrace(); 
  16.         } 
  17.         indexFile.indexEnd();*/  
  18.            
  19.         IndexFile indexFile1=new IndexFile();  
  20.         indexFile1.indexInit();  
  21.         try {  
  22.             indexFile1.Index3Data();  
  23.         } catch (SQLException e1) {  
  24.             e1.printStackTrace();  
  25.         }  
  26.         indexFile1.indexEnd();  
  27.            
  28.         LuceneSearch lch=new LuceneSearch();  
  29.         try {  
  30.             long a = System.currentTimeMillis();  
  31.             lch.highLightSearch("name""flv"1,3);  
  32.             long b = System.currentTimeMillis();  
  33.             long c = b - a;  
  34.             System.out.println("[高级检索花费时间:" + c + "毫秒]");  
  35.         } catch (Exception e) {  
  36.             e.printStackTrace();  
  37.         }  
  38.     }  
  39. }  
import WordSegment.*; import java.awt.event.ActionEvent; import java.awt.event.ActionListener; import java.awt.*; import java.io.File; import java.util.Vector; import javax.swing.*; /** * */ /** * @author Truman * */ public class WordSegDemoFrame extends JFrame implements ActionListener { final static int ALGO_FMM = 1; final static int ALGO_BMM = 2; private JMenuBar menuBar = new JMenuBar(); private JMenuItem openDicItem, closeItem; private JRadioButtonMenuItem fmmItem, bmmItem; private JMenuItem openTrainFileItem, saveDicItem, aboutItem; private JButton btSeg; private JTextField tfInput; private JTextArea taOutput; private JPanel panel; JLabel infoDic, infoAlgo; private WordSegment seger; private DicTrainer trainer = new DicTrainer(); private void initFrame() { setTitle("Mini分词器"); setDefaultCloseOperation(EXIT_ON_CLOSE); setJMenuBar(menuBar); JMenu fileMenu = new JMenu("文件"); JMenu algorithmMenu = new JMenu("分词算法"); JMenu trainMenu = new JMenu("训练语料"); JMenu helpMenu = new JMenu("帮助"); openDicItem = fileMenu.add("载入词典"); fileMenu.addSeparator(); closeItem = fileMenu.add("退出"); algorithmMenu.add(fmmItem = new JRadioButtonMenuItem("正向最大匹配", true)); algorithmMenu.add(bmmItem = new JRadioButtonMenuItem("逆向最大匹配", false)); ButtonGroup algorithms = new ButtonGroup(); algorithms.add(fmmItem); algorithms.add(bmmItem); openTrainFileItem = trainMenu.add("载入并训练语料"); saveDicItem = trainMenu.add("保存词典"); aboutItem = helpMenu.add("关于Word Segment Demo"); menuBar.add(fileMenu); menuBar.add(algorithmMenu); menuBar.add(trainMenu); menuBar.add(helpMenu); openDicItem.addActionListener(this); closeItem.addActionListener(this); openTrainFileItem.addActionListener(this); saveDicItem.addActionListener(this); aboutItem.addActionListener(this); fmmItem.addActionListener(this); bmmItem.addActionListener(this); JPanel topPanel = new JPanel(); topPanel.setLayout(new FlowLayout());
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值