/**
* com.jiaoyiping.pdstest.TestTika.java
* Copyright (c) 2009 Hewlett-Packard Development Company, L.P.
* All rights reserved.
*/
package com.jiaoyiping.pdstest;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.mail.RFC822Parser;
import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
/**
* <pre>
* Desc:
* @author 焦一平
* @refactor 焦一平
* @date 2014年12月4日 下午1:31:09
* @version 1.0
* @see
* REVISIONS:
* Version Date Author Description
* -------------------------------------------------------------------
* 1.0 2014年12月4日 焦一平 1. Created this class.
* </pre>
*/
public class TestTika {
//解析PDF
@Test
public void testPdf() throws Exception{
Long start = System.currentTimeMillis();
Parser parser = new PDFParser();
InputStream is = new BufferedInputStream(new FileInputStream(new File("D:\\我的微盘\\文档\\参考文档\\Linux Shell脚本攻略.pdf")));
OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\\Users\\Administrator\\Desktop\\result.txt")));
Metadata meta = new Metadata();
meta.add(Metadata.CONTENT_ENCODING, "utf-8");
ContentHandler iHandler = new BodyContentHandler(os);
parser.parse(is, iHandler, meta, new ParseContext());
Long end = System.currentTimeMillis();
Long used = (end-start)/1000;
System.out.println("耗时: "+used+"秒");
}
//解析Word
@Test
public void testWrod() throws Exception{
Long start = System.currentTimeMillis();
Parser parser = new OfficeParser();
InputStream is = new BufferedInputStream(new FileInputStream(new File("D:\\我的微盘\\文档\\参考文档\\jBPM5_用户指南中文版.doc")));
OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\\Users\\Administrator\\Desktop\\result.txt")));
Metadata meta = new Metadata();
meta.add(Metadata.CONTENT_ENCODING, "utf-8");
ContentHandler iHandler = new BodyContentHandler(os);
parser.parse(is, iHandler, meta, new ParseContext());
Long end = System.currentTimeMillis();
Long used = (end-start)/1000;
System.out.println("耗时:"+used+"秒");
}
//解析EMAIL(只能解析标准的eml格式的,不能解析微软的msg格式)
//使用commons-email来进行解析的可以得到收件人、发件人、主题、内容等元数据,TIkA是否支持未尝试
@Test
public void testEmail() throws Exception{
Long start = System.currentTimeMillis();
Parser parser = new RFC822Parser();
InputStream is = new BufferedInputStream(new FileInputStream(new File("C:\\Users\\Administrator\\Downloads\\回复_ RE_ 数据导入工作 - 外部系统枚举与U-Cloud枚举映射.eml")));
OutputStream os = new BufferedOutputStream(new FileOutputStream(new File("C:\\Users\\Administrator\\Desktop\\result.txt")));
Metadata meta = new Metadata();
meta.add(Metadata.CONTENT_ENCODING, "utf-8");
ContentHandler iHandler = new BodyContentHandler(os);
parser.parse(is, iHandler, meta, new ParseContext());
Long end = System.currentTimeMillis();
Long used = (end-start)/1000;
System.out.println("耗时:"+used+"秒");
}
}