【转】JTidy转换html到xml 代码

 

JTidy转换html到xml
方法一:目前无法解决乱码问题
package spide;
import java.io.PrintWriter;
import java.io.FileInputStream;
import java.io.IOException;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import java.io.FileOutputStream;
/**
* A sample DOM writer. This sample program illustrates how to
* traverse a DOM tree in order to print a document that is parsed.
*
*/
public class TestDOM {
protected PrintWriter out;
public TestDOM() {
try
{
FileOutputStream outxml=new FileOutputStream("D:/test.xml");
out = new PrintWriter(outxml);
}
catch(Exception e)
{
  e.printStackTrace();
 }
}
/** Prints the specified node, recursively. */
public void print(Node node) {
if ( node == null ) {
return;
}
int type = node.getNodeType();
switch ( type ) {
case Node.DOCUMENT_NODE:
out.println("<?xml version=\"1.0\" encoding=\"GBK\"?>");
print(((Document)node).getDocumentElement());
out.flush();
break;
case Node.ELEMENT_NODE:
out.print('<');
out.print(node.getNodeName());
NamedNodeMap attrs = node.getAttributes();
for ( int i = 0; i < attrs.getLength(); i++ ) {
out.print(' ');
out.print(attrs.item(i).getNodeName());
out.print("=\"");
out.print(attrs.item(i).getNodeValue());
out.print('"');
}
out.print('>');
out.println(); // HACK
NodeList children = node.getChildNodes();
if ( children != null ) {
int len = children.getLength();
for ( int i = 0; i < len; i++ ) {
print(children.item(i));
}
}
break;
case Node.TEXT_NODE:
out.print(node.getNodeValue());
break;
}
if ( type == Node.ELEMENT_NODE ) {
out.print("</");
out.print(node.getNodeName());
out.print('>');
out.println(); // HACK
}
out.flush();
}
public static void main(String args[]) {
 String conf="D:/tidy.properties";
FileInputStream in;
 
Tidy tidy = new Tidy();
tidy.setConfigurationFromFile(conf);
TestDOM t = new TestDOM();
try {
in = new FileInputStream("D:/speed.html");
tidy.setMakeClean(true);
tidy.setXmlTags(true);
t.print(tidy.parseDOM(in, null));
}
catch ( IOException e ) {
System.err.println( e.toString() );
}
}
}
?
方法二:可以解决乱码,但解析生成的 XML 时会出现 "White spaces are required between publicId and systemId" 错误
package spide;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import org.w3c.tidy.Tidy;
public class Test17 implements Runnable {
    private String srcFileName;
    private String outFileName;
    private String errOutFileName;
    private String configFileName;
    public Test17(String srcFileName, String outFileName,??? String confName) {
          this.srcFileName = srcFileName;
          this.outFileName = outFileName;
          this.configFileName= confName;
     }
      public void run() {
          BufferedInputStream in;
          FileOutputStream out;
          Tidy tidy = new Tidy();
     tidy.setConfigurationFromFile(configFileName);
     try {
       // tidy.setErrout(new PrintWriter(new FileWriter(errOutFileName), true));
                in = new BufferedInputStream(new FileInputStream(srcFileName));
                out = new FileOutputStream(outFileName);
               String head = "<?xml version=\"1.0\" encoding=\"GBK\"?>";
               byte[] bytes = head.getBytes();
               out.write(bytes, 0, bytes.length);

           tidy.parse(in, out);
           } catch (IOException e) {
                    System.out.println(this.toString() + e.toString());
          }
      }
     public static void main(String[] args) {
     String src="D:/speed.html";
     String out="D:/result.xml";
     String err="D:/err.txt";
     String conf="D:/tidy.properties";
          Test17 t1 = new Test17(src,out,conf);
           Thread th1 = new Thread(t1);
           th1.start();
      }
}

     

 

转载于:https://www.cnblogs.com/xiaoman_890/archive/2009/01/18/1377935.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值