[size=xx-large][color=red][b]测试代码:[/b][/color][/size]
[url]http://yunpan.cn/Qb93GuJDtIUL5[/url]
package com.tika.test;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.sax.BodyContentHandler;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
 * Demo utility that extracts text from documents with Apache Tika and
 * indexes / searches that text with Lucene using the IK analyzer.
 *
 * <p>All paths (input documents and the index directory {@code d:/lucene})
 * are hard-coded; the class is meant to be run from {@link #main(String[])}.
 */
public class IndexUtil {

    /**
     * Extracts the text of a hard-coded spreadsheet and prints it to standard output.
     *
     * <p>The other entry points that can be tried from here are {@link #index()}
     * (build the index), {@link #searche(String)} (query the index) and
     * {@link #tikaTool(File)} (extraction through the {@code Tika} facade).
     *
     * @param args unused
     */
    public static void main(String[] args) throws ParseException, IOException, TikaException {
        File f = new File("C:/高军威.xls");
        System.out.println(fileToTxt(f));
    }

    /**
     * Extracts the text of a file through the {@code Tika} facade and prints
     * every metadata entry collected during parsing as {@code name:value}.
     *
     * @param f the document to parse
     * @return the extracted text
     * @throws IOException   if the file cannot be opened or read
     * @throws TikaException if Tika fails to parse the document
     */
    public static String tikaTool(File f) throws IOException, TikaException {
        Tika tika = new Tika();
        Metadata metadata = new Metadata();
        InputStream in = new FileInputStream(f);
        String text;
        try {
            text = tika.parseToString(in, metadata);
        } finally {
            // Safeguard: Tika's documentation says parseToString closes the
            // stream itself; closing an already closed FileInputStream is harmless.
            in.close();
        }
        for (String name : metadata.names()) {
            System.out.println(name + ":" + metadata.get(name));
        }
        return text;
    }

    /**
     * Extracts the body text of a file with an auto-detecting Tika parser and
     * prints every metadata entry collected during parsing as {@code name:value}.
     *
     * <p>Failures are printed to standard error and reported to the caller as
     * a {@code null} return value.
     *
     * @param f the document to parse
     * @return the extracted body text, or {@code null} if reading or parsing failed
     */
    public static String fileToTxt(File f) {
        // AutoDetectParser picks the concrete parser from the content. Explicit
        // alternatives: PDFParser (PDF), HtmlParser (web pages), OOXMLParser
        // (Office 2007+ formats), OfficeParser (Office 2003 and older).
        Parser parser = new AutoDetectParser();
        InputStream is = null;
        try {
            Metadata metadata = new Metadata();
            // Encoding hint, set for HTML input to prevent garbled characters.
            metadata.add(Metadata.CONTENT_ENCODING, "utf-8");
            // The file name is passed along as a detection hint.
            metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName());
            is = new FileInputStream(f);
            // NOTE(review): the no-arg BodyContentHandler is documented to cap its
            // output (100,000 characters by default), so very large documents may
            // end in a SAXException here - confirm whether a limit is wanted.
            ContentHandler handler = new BodyContentHandler();
            ParseContext context = new ParseContext();
            // Register the parser so embedded documents are parsed with it too.
            context.set(Parser.class, parser);
            parser.parse(is, handler, metadata, context);
            for (String name : metadata.names()) {
                System.out.println(name + ":" + metadata.get(name));
            }
            return handler.toString();
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (TikaException e) {
            e.printStackTrace();
        } finally {
            try {
                if (is != null) {
                    is.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * Searches the {@code content} field of the index in {@code d:/lucene} and
     * returns the stored {@code ids} value of the best-scoring document.
     *
     * @param searchString the query text, parsed with the classic query parser
     * @return the {@code ids} value of the top hit, or an empty string when
     *         nothing matches
     * @throws ParseException if the query text cannot be parsed
     * @throws IOException    if the index cannot be opened or read
     */
    public static String searche(String searchString) throws ParseException, IOException {
        IKAnalyzer analyzer = new IKAnalyzer();
        String[] fields = {"content"};
        QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_40, fields, analyzer);
        Query query = parser.parse(searchString);

        Directory dir = FSDirectory.open(new File("d:/lucene"));
        try {
            IndexReader indexReader = DirectoryReader.open(dir);
            try {
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                // Only the best hit is used, so only one hit is requested.
                ScoreDoc[] docs = indexSearcher.search(query, 1).scoreDocs;
                if (docs.length > 0) {
                    Document document = indexSearcher.doc(docs[0].doc);
                    return document.get("ids");
                }
                return "";
            } finally {
                // Release the index files held by the reader.
                indexReader.close();
            }
        } finally {
            dir.close();
        }
    }

    /**
     * Rebuilds the index in {@code d:/lucene} from scratch with a single
     * document: the text of {@code C:/ITeye.pdf} in the {@code content} field
     * and the fixed identifier {@code "110"} in the {@code ids} field.
     *
     * <p>I/O failures are printed to standard error; the success message is
     * printed only when the index was actually written and closed. On failure
     * the writer is rolled back so the write lock is released.
     */
    public static void index() {
        try {
            File f = new File("C:/ITeye.pdf");
            IKAnalyzer analyzer = new IKAnalyzer();
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer);

            // Body text: searchable and analyzed, but not stored. A Reader-valued
            // field must be tokenized and must not be stored.
            FieldType contentType = new FieldType();
            contentType.setStored(false);   // do not store the value
            contentType.setIndexed(true);   // make it searchable
            contentType.setTokenized(true); // run it through the analyzer

            // Identifier: stored so it can be read back from a hit, indexed verbatim.
            FieldType idType = new FieldType();
            idType.setStored(true);     // store the value
            idType.setIndexed(true);    // make it searchable
            idType.setTokenized(false); // keep it as a single term

            Directory dir = FSDirectory.open(new File("d:/lucene"));
            try {
                IndexWriter writer = new IndexWriter(dir, indexWriterConfig);
                boolean closed = false;
                try {
                    writer.deleteAll();
                    Document doc = new Document();
                    doc.add(new Field("content", new Tika().parse(f), contentType));
                    doc.add(new Field("ids", "110", idType));
                    writer.addDocument(doc);
                    writer.close(); // commits the changes
                    closed = true;
                } finally {
                    if (!closed) {
                        // Discard the partial changes and release the write lock.
                        try {
                            writer.rollback();
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                    }
                }
            } finally {
                dir.close();
            }
            System.out.println("索引创建成功!!");
        } catch (IOException e) {
            // Also covers CorruptIndexException, LockObtainFailedException
            // and FileNotFoundException.
            e.printStackTrace();
        }
    }
}
[url]http://yunpan.cn/Qb93GuJDtIUL5[/url]