lucene 索引非txt文档 (pdf word rtf html xml)_mincartf opcartf-优快云博客

本文介绍了一种基于Java的索引方法，用于将多种格式的文档（如txt、Word、PDF、RTF）转化为纯文本并进行索引。通过使用Java世界中的开源工程，实现了一套简单的索引流程，包括文件转换、文本提取及索引写入。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

搜索之前首先要建立索引。最简单的方式是索引txt文件，上文已经介绍过；这里介绍其它格式文档的索引，例如MS Word、PDF、RTF等。

索引方法：先把各种文档转化成纯文本，再进行索引，所以关键在于转换。幸好Java世界中有很多开源工程，其中不少可以直接拿来使用。

下面是根据网上资料做的一个索引示例，简单地实现了对富文本格式文件的索引：

// Builds a Lucene index over every regular file in dateDir. Word (.doc/.docx), PDF and RTF
// files are first converted to plain text; any other file is indexed straight from a Reader.
// Each document gets three fields: "contents", "filename" and "indexDate".
Calendar begin = Calendar.getInstance();
// Directory where the index files are stored.
String indexDir = "D:\\indexDir";
// Directory holding the files to be indexed.
String dateDir = "D:\\dateDir";
// Open the on-disk index directory.
Directory dir = new SimpleFSDirectory(new File(indexDir));
// IndexWriter arguments: the Directory, the analyzer, the "create" flag (true builds a new
// index and replaces any existing one, false appends to the existing index) and the maximum
// field length. MaxFieldLength caps how many terms are indexed per field; it does not control
// how the text is tokenized.
IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_29), true, IndexWriter.MaxFieldLength.LIMITED);
try {
  File[] files = new File(dateDir).listFiles();
  if (files == null) {
    // listFiles() returns null when the path is not a directory or cannot be read.
    files = new File[0];
  }
  // The index date has day resolution and is the same for every file in this run.
  String indexDate = DateTools.dateToString(new Date(), DateTools.Resolution.DAY);

  for (int i = 0; i < files.length; i++) {
    File file = files[i];
    if (!file.isFile()) {
      // Sub-directories and other non-regular entries cannot be opened as documents.
      continue;
    }
    String fileName = file.getName();
    // A name without a dot has no extension; lastIndexOf returns -1 and substring(-1) would throw.
    int dot = fileName.lastIndexOf('.');
    String subfix = dot < 0 ? "" : fileName.substring(dot).toLowerCase(java.util.Locale.ENGLISH);

    Document doc = new Document();
    String text = null;            // extracted plain text for the rich-text formats
    FileReader plainReader = null; // only used for the plain-text fallback
    try {
      if (subfix.equals(".doc")) {
        // Word 97-2003: images are not extracted; table cells and rows come back
        // separated by delimiter characters at the position of the table.
        FileInputStream in = new FileInputStream(file);
        try {
          text = new WordExtractor(in).getText();
        } finally {
          in.close();
        }
      } else if (subfix.equals(".docx")) {
        // Word 2007+: images are not extracted; table contents are appended at the
        // end of the text, one table after another.
        OPCPackage opcPackage = POIXMLDocument.openPackage(file.getPath());
        try {
          text = new XWPFWordExtractor(opcPackage).getText();
        } finally {
          // revert() releases the package without writing anything back to the source file.
          opcPackage.revert();
        }
      } else if (subfix.equals(".pdf")) {
        FileInputStream in = new FileInputStream(file);
        try {
          PDFParser parser = new PDFParser(in);
          parser.parse();
          PDDocument pdDoc = new PDDocument(parser.getDocument());
          try {
            // NOTE(review): the original had an empty isEncrypted() check here; encrypted
            // files are still passed to the stripper as before - confirm the desired handling.
            text = new PDFTextStripper().getText(pdDoc);
          } finally {
            // Closing the PDDocument also releases the underlying COSDocument.
            pdDoc.close();
          }
        } finally {
          in.close();
        }
      } else if (subfix.equals(".rtf")) {
        // RTF is read with the JDK's own javax.swing RTF support.
        DefaultStyledDocument styleDoc = new DefaultStyledDocument();
        FileInputStream in = new FileInputStream(file);
        try {
          new RTFEditorKit().read(in, styleDoc, 0);
        } finally {
          in.close();
        }
        text = styleDoc.getText(0, styleDoc.getLength());
      } else {
        // Anything else is treated as plain text and tokenized directly from the reader
        // (not stored). FileReader decodes with the platform default charset.
        plainReader = new FileReader(file);
      }

      // Create the Field objects and add them to the document.
      if (plainReader != null) {
        doc.add(new Field("contents", plainReader));
      } else {
        doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS));
      }
      doc.add(new Field("filename", fileName, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS));
      doc.add(new Field("indexDate", indexDate, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS));

      // Hand the document to the IndexWriter; the reader (if any) is consumed here.
      indexWriter.addDocument(doc);
    } finally {
      if (plainReader != null) {
        plainReader.close();
      }
    }
  }

  // Report how many documents the writer now holds.
  System.out.println("numDocs"+indexWriter.numDocs());
} finally {
  // Without commit() or close(), added documents are invisible to any IndexReader or
  // IndexSearcher. close() commits, and running it in finally also releases the index
  // write lock when a file fails.
  indexWriter.close();
}
Calendar end = Calendar.getInstance();

System.out.println("使用了："+(end.getTimeInMillis()-begin.getTimeInMillis())+"毫秒！！！");