package testlucene;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that extracts the text content of an HTML page.
 *
 * <p>{@link #parse(String, String)} cleans the HTML with HtmlCleaner, feeds the
 * resulting XML to a SAX parser with this instance as the handler, and
 * {@link #endDocument()} writes the collected text to the target file.
 * Character data inside {@code <style>} and {@code <script>} elements is
 * skipped.
 *
 * <p>Instances keep per-parse state in fields, so one instance must not be
 * used for two parses at the same time.
 */
public class SAXxhtml extends DefaultHandler {
    /** Logger for this class. */
    private static final Logger logger = Logger.getLogger(SAXxhtml.class);

    /** Text collected so far; every chunk is followed by a newline. */
    public StringBuffer sb = new StringBuffer();

    /** {@code true} while character data should be collected. */
    public boolean usable = true;

    /** Path of the file that {@link #endDocument()} writes the text to. */
    private String sPath = "";

    /** Number of currently open {@code style}/{@code script} elements. */
    private int skipDepth = 0;

    /** Creates a handler and installs the basic log4j configuration. */
    public SAXxhtml() {
        super();
        BasicConfigurator.configure();
    }

    /**
     * Starts skipping character data when a {@code style} or {@code script}
     * element opens.
     */
    @Override
    public void startElement(String namespaceURI, String localName,
            String rawName, Attributes atts) throws SAXException {
        if (isSkippedElement(rawName)) {
            skipDepth++;
            usable = false;
        }
    }

    /**
     * Resumes collecting character data once the last open {@code style} or
     * {@code script} element has closed.
     */
    @Override
    public void endElement(String arg0, String arg1, String arg2)
            throws SAXException {
        super.endElement(arg0, arg1, arg2);
        if (isSkippedElement(arg2) && skipDepth > 0) {
            skipDepth--;
            usable = (skipDepth == 0);
        }
    }

    /**
     * Appends a chunk of character data, followed by a newline, unless the
     * parser is currently inside a skipped element.
     *
     * <p>The flag is deliberately NOT reset here: a parser may deliver the
     * content of one element in several calls, and an empty element delivers
     * none, so only {@link #endElement} knows when skipping is over.
     */
    @Override
    public void characters(char[] ch, int start, int length) {
        if (usable) {
            sb.append(ch, start, length);
            sb.append("\n");
        }
    }

    /**
     * Writes the collected text to the output file once parsing is finished.
     *
     * <p>Failures are logged rather than thrown. Nothing is written when no
     * output path has been set.
     */
    @Override
    public void endDocument() throws SAXException {
        if (sPath == null || sPath.length() == 0) {
            logger.warn("endDocument() - no output path set, extracted text not written");
            return;
        }
        PrintWriter pw = null;
        try {
            pw = new PrintWriter(new FileOutputStream(sPath));
            pw.print(sb.toString());
            // PrintWriter swallows IOExceptions; checkError() flushes and reports them.
            if (pw.checkError()) {
                logger.error("endDocument() - error while writing " + sPath);
            }
        } catch (FileNotFoundException e) {
            logger.error("endDocument() - cannot open output file " + sPath, e);
        } finally {
            if (pw != null) {
                pw.close();
            }
        }
    }

    /**
     * Command-line entry point; currently does nothing.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Intentionally empty.
    }

    /**
     * Cleans the given HTML, parses the resulting XML with this handler and
     * writes the extracted text to {@code sPath}.
     *
     * <p>Any text collected by a previous call is discarded first. Errors are
     * logged and not propagated.
     *
     * @param sPath    path of the text file to write
     * @param Scontent HTML source to extract text from; {@code null} is
     *                 logged and ignored
     */
    public void parse(String sPath, String Scontent) {
        if (Scontent == null) {
            logger.warn("parse(String, String) - no content given for " + sPath);
            return;
        }
        this.sPath = sPath;
        // Start from a clean state so repeated calls do not mix documents.
        sb.setLength(0);
        skipDepth = 0;
        usable = true;
        try {
            HtmlCleaner hc = new HtmlCleaner(Scontent);
            hc.clean();
            String xml = hc.getBrowserCompactXmlAsString();
            InputSource iSrc = new InputSource(new StringReader(xml));

            // NOTE(review): the parser runs with factory defaults. If the
            // cleaned XML can carry a DOCTYPE taken from untrusted pages,
            // consider disabling external entity resolution - confirm
            // against HtmlCleaner's actual output.
            SAXParserFactory spf = SAXParserFactory.newInstance();
            SAXParser saxParser = spf.newSAXParser();

            // Parse the cleaned XML directly, with THIS instance as handler,
            // so that sb and sPath belong to the object receiving the events.
            saxParser.parse(iSrc, this);
        } catch (IOException e) {
            logger.error("parse(String, String) - I/O error for " + sPath, e);
        } catch (ParserConfigurationException e) {
            logger.error("parse(String, String) - cannot create SAX parser", e);
        } catch (SAXException e) {
            logger.error("parse(String, String) - cleaned XML is not parseable for " + sPath, e);
        }
    }

    /** Returns whether character data inside the named element is skipped. */
    private static boolean isSkippedElement(String rawName) {
        return "style".equalsIgnoreCase(rawName) || "script".equalsIgnoreCase(rawName);
    }
}
具体思路是:先把 HTML 转换成 XML,然后用 SAX 解析这个 XML。但是程序一直调不通,有人能帮忙看一下吗?
SAX解析HTML
本文介绍了一个使用SAX解析器将HTML转换为XML的过程,并尝试解析转换后的XML文件。该过程涉及使用HtmlCleaner清理HTML,然后通过SAX解析器读取清理后的XML内容。
5381

被折叠的 条评论
为什么被折叠?



