LuceneAPI的学习笔记

最新推荐文章于 2025-09-05 16:34:31 发布

转载最新推荐文章于 2025-09-05 16:34:31 发布 · 91 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：https://my.oschina.net/moonroot/blog/806100

文章标签：

#java #python #数据库

本文介绍Lucene中不同类型的字段属性及多种分词器的使用方法，并演示了如何进行单词、短语、通配符等各类查询操作。

2019独角兽企业重金招聘Python工程师标准>>>

Lucene的常用属性：
1、Field

三种常用的字段类型（Field）：TextField、StringField、StoredField；
TextField：常用于要分词、要搜索的数据库字段（是否存储由 Store.YES / Store.NO 决定）
StringField：常用于不分词、但要搜索的数据库字段（整个值作为一个词项建立索引；是否存储由 Store.YES / Store.NO 决定）

StoredField：只存储、不建立索引（不能被搜索）的数据库字段

2、分词器

    // English sample text (not referenced by the analyzer demos visible in this snippet).
    private String en = "oh my lady gaga";
   // Chinese sample text written without separators; input for the Standard/Simple/CJK analyzer demos.
   private String cn = "迅雷不及掩耳盗铃儿响叮当仁不让";
   // Mixed Chinese/English sample text; input for the SmartChinese and IK analyzer demos.
   private String str = "学习使我进步FullText Search Lucene学习的好好";
   /**
    * Tokenizes {@code str} with the given analyzer and prints the token stream's
    * string representation to standard output once per token.
    *
    * <p>Follows the full TokenStream consumer workflow
    * ({@code reset} -> {@code incrementToken}* -> {@code end} -> {@code close}),
    * so the same analyzer instance can safely be used again afterwards.
    *
    * @param analyzer the analyzer under test; it is not closed by this method
    * @param str the text to tokenize
    * @throws Exception if the token stream cannot be created or consumed
    */
   public void testAnalyzer(Analyzer analyzer,String str) throws Exception {
       // try-with-resources guarantees close() even if tokenization throws.
       try (TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(str))) {
           // The stream must be reset before the first incrementToken() call.
           tokenStream.reset();
           while (tokenStream.incrementToken()) {
               System.out.println(tokenStream);
           }
           // Signals end-of-stream so final offset/state bookkeeping is performed.
           tokenStream.end();
       }
   }

   /**
    * Standard tokenization: runs {@link StandardAnalyzer} over the Chinese sample
    * text {@code cn} and prints the resulting tokens.
    *
    * @throws Exception if tokenization fails
    */
   @Test
   public void testStandardAnalyzer() throws Exception {
       // Analyzer is Closeable; release it once the demo has finished.
       try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
           testAnalyzer(analyzer, cn);
       }
   }

   /**
    * Simple tokenization: runs {@link SimpleAnalyzer} over the Chinese sample
    * text {@code cn} and prints the resulting tokens.
    *
    * @throws Exception if tokenization fails
    */
   @Test
   public void testSimpleAnalyzer() throws Exception {
       // Analyzer is Closeable; release it once the demo has finished.
       try (SimpleAnalyzer analyzer = new SimpleAnalyzer()) {
           testAnalyzer(analyzer, cn);
       }
   }

   /**
    * Bigram tokenization: runs {@link CJKAnalyzer} over the Chinese sample
    * text {@code cn} and prints the resulting tokens.
    *
    * @throws Exception if tokenization fails
    */
   @Test
   public void testCJKAnalyzer() throws Exception {
       // Analyzer is Closeable; release it once the demo has finished.
       try (CJKAnalyzer analyzer = new CJKAnalyzer()) {
           testAnalyzer(analyzer, cn);
       }
   }

   /**
    * Dictionary-based tokenization: runs {@link SmartChineseAnalyzer}, configured
    * with a custom stop-word set, over the mixed sample text {@code str} and
    * prints the resulting tokens.
    *
    * @throws Exception if tokenization fails
    */
   @Test
   public void testSmartCnAnalyzer() throws Exception {
       // Custom stop-word set containing the single particle "的".
       CharArraySet stopWords = new CharArraySet(10, true);
       stopWords.add("的");
       // Analyzer is Closeable; release it once the demo has finished.
       try (SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(stopWords)) {
           testAnalyzer(analyzer, str);
       }
   }

   /**
    * IK tokenization: runs {@link IKAnalyzer} over the mixed sample text
    * {@code str} and prints the resulting tokens.
    *
    * @throws Exception if tokenization fails
    */
   @Test
   public void testIKAnalyzer() throws Exception {
       // NOTE(review): the 'true' argument is presumably IK's smart-segmentation
       // switch -- confirm against the IK Analyzer version in use.
       // Analyzer is Closeable; release it once the demo has finished.
       try (IKAnalyzer analyzer = new IKAnalyzer(true)) {
           testAnalyzer(analyzer, str);
       }
   }

//查询

/**
* 索引的添删改
* @author admin
*
*/
public class QueryTestDemo {

   // 准备数据源
   private String doc1 = "hello world";
   private String doc2 = "hello java world";
   private String doc3 = "hello lucene world";

   // 索引目录
   private String dirPath = "G:\\Sirius\\soft\\eclipse\\workspace\\Lucene\\queryIndex";

   /**
   * 索引创建
   * @throws Exception
   */
   @Test
   public void createIndex() throws Exception {
       Directory directory = FSDirectory.open(Paths.get(dirPath));
       IKAnalyzer analyzer = new IKAnalyzer();
       IndexWriterConfig config = new IndexWriterConfig(analyzer);
       config.setOpenMode(OpenMode.CREATE); // 代表每次都新建（测试）

       IndexWriter indexWriter = new IndexWriter(directory, config);

       Document document1= new Document();
       document1.add(new TextField("id", "1",Store.YES));
       document1.add(new TextField("title", "doc1",Store.YES));
       document1.add(new TextField("content", doc1,Store.YES));
       document1.add(new TextField("inputtime", "20160812",Store.YES));
       indexWriter.addDocument(document1);

       Document document2= new Document();
       document2.add(new TextField("id", "2",Store.YES));
       document2.add(new TextField("title", "doc2",Store.YES));
       document2.add(new TextField("content", doc2,Store.YES));
       document2.add(new TextField("inputtime", "20160813",Store.YES));
       indexWriter.addDocument(document2);

       Document document3= new Document();
       document3.add(new TextField("id", "3",Store.YES));
       document3.add(new TextField("title", "doc3",Store.YES));
       document3.add(new TextField("content", doc3,Store.YES));
       document3.add(new TextField("inputtime", "20160814",Store.YES));
       indexWriter.addDocument(document3);

//       indexWriter.commit();
       indexWriter.close();

   }

   /**
   * 单词搜索
   */
   @Test
   public void testTermQuery() throws Exception {
       search("content:hello");
       System.out.println("==========================================");
       TermQuery query = new TermQuery(new Term("content","hello"));
       search(query );
   }

   /**
   * 短语搜索
   */
   @Test
   public void testPhraseQuery() throws Exception {
       search("\"hello world\"");
       System.out.println("==========================================");
       PhraseQuery.Builder builder = new PhraseQuery.Builder();
       builder.add(new Term("content","hello"));
       builder.add(new Term("content","world"));

       PhraseQuery query = builder.build();
       search(query );
   }

   /**
   * 通配符搜索
   *    ? : 1个任意字符（站位）
   * * : 0~N个任意字符
   * @throws Exception
   */
   @Test
   public void testWildcardQuery() throws Exception {
       search("lu*n?");
       System.out.println("==========================================");
       WildcardQuery query = new WildcardQuery(new Term("content","lu*n?"));
       search(query);
   }

   /**
   * 模糊搜索
   *    在单字搜索的基础上，后面跟一个~【0，2】整数，代表最大容错数
   * @throws Exception
   */
   @Test
   public void testQuery() throws Exception {
       search("luXenX~2");
       System.out.println("==========================================");
       FuzzyQuery query = new FuzzyQuery(new Term("content","luXenX"));
       search(query);
   }

   /**
   * 临近查询
   * 在“短语”搜索的基础上，后面跟一个~【0，100】整数，代表最大间隔数
   * @throws Exception
   */
   @Test
   public void testPhraseQuery2() throws Exception {
       search("\"hello world\"~2");
       System.out.println("==========================================");

       PhraseQuery.Builder builder = new PhraseQuery.Builder();
       builder.add(new Term("content","hello"));
       builder.add(new Term("content","world"));
       builder.setSlop(2);// 最大间隔数
       PhraseQuery query = builder.build();

       search(query);

   }

   /**
   * 匹配所有
   * @throws Exception
   */
   @Test
   public void testMatchAll() throws Exception {
       search("*:*");
       System.out.println("==========================================");
       search(new MatchAllDocsQuery());
   }

   /**
   * 范围查询
   * @throws Exception
   */
   @Test
   public void testTermRangeQuery() throws Exception {
//       search("inputtime:[20160812 TO 20160814]");
//       search("inputtime:{20160812 TO 20160814}");
       search("inputtime:{20160812 TO 20160814]");
       System.out.println("==========================================");
       TermRangeQuery query = new TermRangeQuery("inputtime", new BytesRef("20160812"), new BytesRef("20160814"), false, true);
       search(query);
   }
   //组合查询
   @Test
   public void testBooleanQuery() throws Exception {
       search("+content:java -inputtime:[20160812 TO 20160814}");
       System.out.println("==========================================");
       Builder builder = new BooleanQuery.Builder();

       Query qo1 = new TermQuery(new Term("content","java")); // 2,3
       builder.add(qo1 , Occur.MUST);

       TermRangeQuery qo2 = new TermRangeQuery("inputtime", new BytesRef("20160812"), new BytesRef("20160814"), true, false); // 1,2
       builder.add(qo2, Occur.MUST_NOT);

       BooleanQuery query = builder.build();
       search(query);
   }

   public void search(String searchKey) throws Exception {
       // 指定索引目录
       Directory directory = new SimpleFSDirectory(Paths.get(dirPath));
       // 创建一个索引读取器
       IndexReader indexReader = DirectoryReader.open(directory);
       // 索引搜索器
       IndexSearcher indexSearcher = new IndexSearcher(indexReader);
       // 创建分词器
       Analyzer analyzer = new IKAnalyzer();
       // 创建查询解析器
       QueryParser queryParser = new QueryParser("content", analyzer);
       // 创建查询对象
       Query query = queryParser.parse(searchKey);
       System.out.println("对应的查询对象："+query.getClass().getName());
       // 搜索
       TopDocs tds = indexSearcher.search(query, 10000);
       System.out.println("一共符合条件的有：" + tds.totalHits);
       // 遍历集合
       for (ScoreDoc scoreDoc : tds.scoreDocs) {
           // 获取内部文档编号
           int docId = scoreDoc.doc;
           // 通过内部文档编号，获取文档
           Document document = indexSearcher.doc(docId);

           System.out.println("=====docId=====:" + docId);
           System.out.println("=====score=====:" + scoreDoc.score);
           System.out.println("===========>id:" + document.get("id")+",title:" + document.get("title")+",content:" + document.get("content")+",inputtime:" + document.get("inputtime"));
           System.out.println();
       }
   }

   public void search(Query query) throws Exception {
       System.out.println("对应的查询语句：" + query.toString());
       // 指定索引目录
       Directory directory = new SimpleFSDirectory(Paths.get(dirPath));
       // 创建一个索引读取器
       IndexReader indexReader = DirectoryReader.open(directory);
       // 索引搜索器
       IndexSearcher indexSearcher = new IndexSearcher(indexReader);
       // 创建分词器
       Analyzer analyzer = new IKAnalyzer();
       // 创建查询解析器
       // 创建查询对象
       // 搜索
       TopDocs tds = indexSearcher.search(query, 10000);
       System.out.println("一共符合条件的有：" + tds.totalHits);
       // 遍历集合
       for (ScoreDoc scoreDoc : tds.scoreDocs) {
           // 获取内部文档编号
           int docId = scoreDoc.doc;
           // 通过内部文档编号，获取文档
           Document document = indexSearcher.doc(docId);

           System.out.println("=====docId=====:" + docId);
           System.out.println("===========>id:" + document.get("id")+",title:" + document.get("title")+",content:" + document.get("content")+",inputtime:" + document.get("inputtime"));
           System.out.println();
       }
   }

   @Test
   public void testBoost() throws Exception{
           search("java lucene^10");
   }
}

转载于:https://my.oschina.net/moonroot/blog/806100