// Parsing: reading table data out of MHT files
import javax.mail.BodyPart;
import javax.mail.Session;
import javax.mail.internet.MimeMessage;
import javax.mail.internet.MimeMultipart;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class MhtFileParser {
public static void main(String[] argv) throws Exception {
File folder = new File("/Users/Downloads/档案数据");
File[] fs = folder.listFiles();
for(int i=0; i<fs.length; i++){
File f = fs[i];
if(!f.getName().endsWith(".mht")){
continue;
}
// System.out.println(f.getName());
if(i == fs.length - 1){
List<String[]> result = testFile(f);
System.out.println(f.getName() + " " + result.size());
for(int k=0; k<result.size(); k++){
System.out.println(k);
String[] data = result.get(k);
for(int j=0; j<data.length; j++){
System.out.println("\t" + data[j]);
}
}
}
}
}
public static List<String[]> testFile(File f) throws Exception{
Properties props = System.getProperties();
Session mailSession = Session.getDefaultInstance(props, null);
InputStream source = new FileInputStream(f);
MimeMessage message = new MimeMessage(mailSession, source);
List<String[]> list = new ArrayList<>();
MimeMultipart body = (MimeMultipart) message.getContent();
for(int i=0; i<body.getCount(); i++){
BodyPart bp = body.getBodyPart(i);
if(bp.getContentType().contains(“text/html”)){
//String html = (String) bp.getContent();
// 用户给的MHT文件中标明的是GB2312编码,但实际为GB18030编码,在遇到部分汉字,例如"旸"时会乱码
// 所以不用bp.getContent()获取,该用inputStream获取
InputStream textStream = bp.getInputStream();
BufferedInputStream bis = new BufferedInputStream(textStream);
InputStreamReader isr = new InputStreamReader(bis, “GB18030”);
BufferedReader br = new BufferedReader(isr);
StringBuffer buf = new StringBuffer();
String line = null;
while ((line = br.readLine()) != null) {
buf.append(line + “\r\n”);
}
br.close();
isr.close();
textStream.close();
String html = buf.toString();
if(html.contains(“清单”)){
//System.out.println(html);
Pattern row = Pattern.compile("<TR[^>]>([\s\S]?)<\/TR>");
Matcher matcher = row.matcher(html);
while(matcher != null && matcher.find()){
String rowHtml = matcher.group(1);
Pattern tds = Pattern.compile("<TD[>]*>([\s\S]*?)<\/TD>[<]<TD[>]*>([\s\S]*?)<\/TD>[<]<TD[>]*>([\s\S]*?)<\/TD>[<]<TD[>]*>([\s\S]*?)<\/TD>[<]<TD[^>]>([\s\S]?)<\/TD>");
Matcher matcherRow = tds.matcher(rowHtml);
while(matcherRow != null && matcherRow.find()) {
String[] item = new String[matcherRow.groupCount()];
for (int j = 1; j <= matcherRow.groupCount(); j++) {
String s = matcherRow.group(j);
String s2 = s.replaceAll(" “, “”);
String s3 = s2.replaceAll(”
", “\r\n”);
item[j - 1] = s3;
}
list.add(item);
}
}
}
}
}
return list;
}
}
// Supplementary note: feeding the parser from an uploaded file instead of a File on disk.
// NOTE(review): everything below is an illustrative fragment, not compilable code. It sits
// outside any class, and the next two lines are a usage sketch rather than statements.
// MultipartFile is presumably Spring's org.springframework.web.multipart.MultipartFile; confirm.
MultipartFile mfile
// NOTE(review): written without call parentheses; presumably mfile.getInputStream() is meant.
testFile(mfile.getInputStream)
// Variant of testFile that takes the MHT data as an InputStream rather than a File.
// Only the MimeMessage construction is shown; there is no return statement, so the rest of
// the body is presumably elided. TODO confirm against the File-based testFile.
public static List<String[]> testFile(InputStream source) throws Exception{
Properties props = System.getProperties();
Session mailSession = Session.getDefaultInstance(props, null);
// Build the MIME message directly from the supplied stream.
MimeMessage message = new MimeMessage(mailSession, source);
}