package com.util.md5;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.sun.corba.se.spi.orbutil.fsm.State;
/**
 * HTML post-processing helpers built on the HTMLParser library.
 *
 * <p>Every method parses an HTML fragment, collects the tags of one type and then
 * rewrites the fragment with plain string replacement.
 *
 * @author Administrator token@token.com
 */
public class Htmlparserutil {

    private static final Logger LOG = Logger.getLogger(Htmlparserutil.class.getName());

    /** Charset name handed to the parser for every fragment. */
    private static final String CHARSET = "UTF-8";

    /** Host prefix that {@link #updateurl(String)} migrates image URLs away from. */
    private static final String OLD_IMAGE_HOST = "http://e.huisou.com";

    /** Host prefix that {@link #updateurl(String)} migrates image URLs to. */
    private static final String NEW_IMAGE_HOST = "http://img.e.huisou.com";

    /** Extensions after which a download URL is cut off; checked in this order. */
    private static final String[] IMAGE_EXTENSIONS = {
        ".jpg", ".JPG", ".jpeg", ".JPEG", ".bmp", ".BMP", ".gif", ".GIF", ".png", ".PNG"
    };

    private static final int BUFFER_SIZE = 8192;

    /**
     * Points every {@code <img>} source in the fragment at {@code dispPath} and downloads
     * the referenced images into {@code pathString}.
     *
     * <p>Example:
     * {@code Htmlparserutil.parserto("<img src='http://host/a.jpg'/>", "D:/upload/", "/upload")}
     * returns {@code <img src='/upload/a.jpg'/>} and tries to save the image as
     * {@code D:/upload/a.jpg}.
     *
     * <p>The work is best effort: the content is rewritten whether or not a download
     * succeeds, a failed download is logged and does not stop the remaining ones, and no
     * exception is propagated to the caller.
     *
     * @param contentString HTML fragment to scan for {@code <img>} tags; may be {@code null}
     * @param pathString prefix of the local target file; the file name is appended without
     *        any separator, so it has to end with one
     * @param dispPath prefix that replaces everything before the last {@code '/'} of each
     *        image URL in the returned content
     * @return the fragment with the image URLs rewritten, the unchanged input if it could
     *         not be parsed, or {@code null} if {@code contentString} is {@code null}
     */
    public static String parserto(String contentString, String pathString, String dispPath) {
        if (contentString == null) {
            return null;
        }

        NodeList imageNodes;
        try {
            imageNodes = extractTags(contentString, ImageTag.class);
        } catch (Exception e) {
            // Deliberately broad: this method has never thrown, so keep it best effort.
            LOG.log(Level.WARNING, "Could not parse content for <img> tags", e);
            return contentString;
        }

        // Each distinct URL is handled once. Empty URLs are skipped because
        // String.replace("", x) would insert x between all characters of the content.
        Set<String> imageUrls = new LinkedHashSet<String>();
        for (int i = 0; i < imageNodes.size(); i++) {
            String imageUrl = ((ImageTag) imageNodes.elementAt(i)).getImageURL();
            if (imageUrl != null && imageUrl.length() > 0) {
                imageUrls.add(imageUrl);
            }
        }

        // Host migration (e.huisou.com -> img.e.huisou.com) is not done here; the URLs
        // are replaced wholesale below, and updateurl(String) covers that use case.
        String rewritten = contentString;
        for (String imageUrl : imageUrls) {
            rewritten = rewritten.replace(imageUrl, dispPath + pictureName(imageUrl));
        }

        for (String imageUrl : imageUrls) {
            String downloadUrl = trimAfterImageExtension(imageUrl);
            String fileName = downloadUrl.substring(downloadUrl.lastIndexOf('/') + 1);
            if (fileName.length() == 0 || fileName.indexOf('\\') >= 0) {
                // No usable file name, or one that could leave pathString on Windows.
                LOG.warning("Skipping image without a usable file name: " + downloadUrl);
                continue;
            }
            try {
                // NOTE(review): the URL comes straight from the HTML, so any protocol the
                // JVM supports (including file:) is fetched - confirm the content is trusted.
                download(downloadUrl, pathString + fileName);
            } catch (Exception e) {
                // One broken image must not prevent the others from being fetched.
                LOG.log(Level.WARNING, "Could not download image " + downloadUrl, e);
            }
        }
        return rewritten;
    }

    /**
     * Replaces every link in the fragment with its link text, i.e. strips the
     * {@code <a>} markup while keeping what the link displayed.
     *
     * @param contentString HTML fragment to process; may be {@code null}
     * @return the fragment with links replaced by their text (possibly only partly
     *         processed if an error occurred), or {@code null} for {@code null} input
     * @throws ParserException declared for source compatibility only; failures are
     *         logged and never propagated
     */
    public static String parserto(String contentString) throws ParserException {
        if (contentString == null) {
            return null;
        }
        String result = contentString;
        try {
            NodeList linkNodes = extractTags(contentString, LinkTag.class);
            for (int i = 0; i < linkNodes.size(); i++) {
                LinkTag link = (LinkTag) linkNodes.elementAt(i);
                String linkHtml = link.toHtml();
                if (linkHtml != null && linkHtml.length() > 0) {
                    result = result.replace(linkHtml, link.getLinkText());
                }
            }
        } catch (Exception e) {
            // Best effort, as before: return whatever has been replaced so far.
            LOG.log(Level.WARNING, "Could not replace links with their text", e);
        }
        return result;
    }

    /**
     * Moves image URLs hosted on {@code http://e.huisou.com} to
     * {@code http://img.e.huisou.com}, keeping the rest of each URL unchanged.
     *
     * @param contentString HTML fragment to process; may be {@code null}
     * @return the fragment with the matching image URLs rewritten, the unchanged input
     *         if it could not be parsed, or {@code null} for {@code null} input
     */
    public static String updateurl(String contentString) {
        if (contentString == null) {
            return null;
        }

        NodeList imageNodes;
        try {
            imageNodes = extractTags(contentString, ImageTag.class);
        } catch (ParserException e) {
            LOG.log(Level.WARNING, "Could not parse content for <img> tags", e);
            return contentString;
        }

        String result = contentString;
        for (int i = 0; i < imageNodes.size(); i++) {
            String imageUrl = ((ImageTag) imageNodes.elementAt(i)).getImageURL();
            if (isOnOldImageHost(imageUrl)) {
                String migrated = NEW_IMAGE_HOST + imageUrl.substring(OLD_IMAGE_HOST.length());
                result = result.replace(imageUrl, migrated);
            }
        }
        return result;
    }

    /** Parses the fragment and returns all nodes, at any depth, that are instances of tagType. */
    private static NodeList extractTags(String html, final Class<?> tagType)
            throws ParserException {
        Parser parser = Parser.createParser(html, CHARSET);
        return parser.extractAllNodesThatMatch(new NodeFilter() {
            public boolean accept(Node node) {
                return tagType.isInstance(node);
            }
        });
    }

    /** Returns the last path segment of the URL including its leading '/'. */
    private static String pictureName(String imageUrl) {
        int lastSlash = imageUrl.lastIndexOf('/');
        if (lastSlash < 0) {
            // Bare file name: add the separator so the result matches the usual shape.
            return "/" + imageUrl;
        }
        return imageUrl.substring(lastSlash);
    }

    /** Cuts the URL right after the first occurrence of each known image extension. */
    private static String trimAfterImageExtension(String imageUrl) {
        String trimmed = imageUrl;
        for (int i = 0; i < IMAGE_EXTENSIONS.length; i++) {
            String extension = IMAGE_EXTENSIONS[i];
            int at = trimmed.indexOf(extension);
            if (at >= 0) {
                trimmed = trimmed.substring(0, at + extension.length());
            }
        }
        return trimmed;
    }

    /** Tells whether the URL starts with the old image host followed by a URL delimiter. */
    private static boolean isOnOldImageHost(String imageUrl) {
        if (imageUrl == null || !imageUrl.startsWith(OLD_IMAGE_HOST)) {
            return false;
        }
        if (imageUrl.length() == OLD_IMAGE_HOST.length()) {
            return true;
        }
        // Reject look-alike hosts such as http://e.huisou.com.example.org.
        char next = imageUrl.charAt(OLD_IMAGE_HOST.length());
        return next == '/' || next == ':' || next == '?' || next == '#';
    }

    /** Copies the resource behind sourceUrl into targetFile and closes both streams. */
    private static void download(String sourceUrl, String targetFile) throws IOException {
        URLConnection connection = new URL(sourceUrl).openConnection();
        InputStream in = connection.getInputStream();
        try {
            OutputStream out = new FileOutputStream(targetFile);
            try {
                byte[] buffer = new byte[BUFFER_SIZE];
                int read;
                // Write only the bytes actually read; the buffer is rarely filled completely.
                while ((read = in.read(buffer)) != -1) {
                    out.write(buffer, 0, read);
                }
                out.flush();
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
    }
}