对于网络爬虫,Java 主要用到的是 java.net 包。
抓取一般的数据:
首先用URL获取连接
然后通过 URLConnection connection = url.openConnection() 得到连接
运用输入字节流把网页的源代码下载读取出来:
// Read the whole response as raw bytes first and decode it once at the end.
// Decoding each 1024-byte chunk separately would (a) append stale bytes left in
// the buffer after a short read and (b) corrupt multi-byte characters that
// straddle a chunk boundary.
InputStream in = connection.getInputStream();
java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
byte[] buf = new byte[1024];
int length;
// read() returns -1 at end of stream.
while ((length = in.read(buf, 0, buf.length)) != -1) {
    // Copy only the bytes actually read in this pass.
    out.write(buf, 0, length);
}
in.close();
StringBuffer sb = new StringBuffer(out.toString(ECODING));
以上只是抓取一个网页的源代码(即浏览器中“查看源代码”看到的内容),而不是下载网页中的图片。
如何下载到图片,整体思路
1.得到图片的名称
2.获取到图片的链接地址
3.根据链接地址下载图片
如何获取图片名称
在获取到的源代码中,图片的表示方法为 <img ... src="">,通过正则表达式"<img.*src=(.*?)[^>]*?>"获取图片的 img 标签,从而可以截取出图片名称。
获取下载连接也只需要正则表达式"http:\"?(.*?)(\"|>|\\s+)"获取到img中的src的链接
根据图片链接地址下载图片,即使用输入输出字节流!
注:正则表达式匹配的方法为
Matcher matcher = Pattern.compile(正则表达式).matcher(待匹配的字符串);
判断是否存在匹配:matcher.find()
得到匹配结果:matcher.group()
源程序:
/**
 * Minimal image crawler: fetches one HTML page, extracts the {@code <img>} tags,
 * pulls the {@code http:} URLs out of those tags and downloads each one into the
 * current working directory, named after the last path segment of its URL.
 */
public class zhuaqu5 {
    // Page whose images are downloaded.
    private static final String PAGE_URL = "http://shop73175232.taobao.com/category-702491284.htm?spm=a1z10.5.w5002-2030937079.7.RcH1w4&search=y&catName=%D5%EB%D6%AF%C9%C0";
    // Charset name used to decode the fetched page.
    private static final String ECODING = "UTF-8";
    // Regex that matches an <img ...> tag.
    private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
    // Regex that matches an http: URL followed by its terminator (quote, '>' or whitespace).
    private static final String IMGSRC_REG = "http:\"?(.*?)(\"|>|\\s+)";
    // Compiled once; Pattern instances are immutable and reusable.
    private static final Pattern IMGURL_PATTERN = Pattern.compile(IMGURL_REG);
    private static final Pattern IMGSRC_PATTERN = Pattern.compile(IMGSRC_REG);

    /**
     * Runs the crawl against {@link #PAGE_URL}.
     *
     * @param args unused
     * @throws Exception if the page itself cannot be fetched or decoded
     */
    public static void main(String[] args) throws Exception {
        zhuaqu5 cm = new zhuaqu5();
        // Fetch the HTML text of the page.
        String HTML = cm.getHTML(PAGE_URL);
        // Extract the <img> tags.
        List<String> imgUrl = cm.getImageUrl(HTML);
        // Extract the src addresses from the tags.
        List<String> imgSrc = cm.getImageSrc(imgUrl);
        // Download the images.
        cm.Download(imgSrc);
    }

    /**
     * Fetches the resource at {@code url} and decodes it with {@link #ECODING}.
     *
     * @param url address of the page to fetch
     * @return the full response body as text
     * @throws Exception if the URL is malformed, the read fails, or the charset is unsupported
     */
    private String getHTML(String url) throws Exception {
        URLConnection connection = new URL(url).openConnection();
        // Collect raw bytes and decode once at the end: decoding chunk by chunk
        // would append stale buffer bytes after a short read and would corrupt
        // multi-byte characters split across two chunks.
        java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
        try (InputStream in = connection.getInputStream()) {
            byte[] buf = new byte[1024];
            int length;
            while ((length = in.read(buf, 0, buf.length)) != -1) {
                body.write(buf, 0, length);
            }
        }
        return body.toString(ECODING);
    }

    /**
     * Finds every match of {@link #IMGURL_REG} in the given HTML. Each match is
     * also echoed to standard output.
     *
     * @param HTML page source to scan
     * @return the matched tag texts, in order of appearance
     */
    private List<String> getImageUrl(String HTML) {
        Matcher matcher = IMGURL_PATTERN.matcher(HTML);
        List<String> listImgUrl = new ArrayList<String>();
        while (matcher.find()) {
            String tag = matcher.group();
            System.out.println(tag);
            listImgUrl.add(tag);
        }
        return listImgUrl;
    }

    /**
     * Extracts every {@code http:} URL from the given tag texts.
     *
     * @param listImageUrl tag texts as returned by {@link #getImageUrl(String)}
     * @return the URLs without their terminating quote, {@code >} or whitespace
     */
    private List<String> getImageSrc(List<String> listImageUrl) {
        List<String> listImgSrc = new ArrayList<String>();
        for (String image : listImageUrl) {
            Matcher matcher = IMGSRC_PATTERN.matcher(image);
            while (matcher.find()) {
                // Group 2 is the terminator and may be several whitespace
                // characters long, so cut at its start instead of dropping
                // exactly one trailing character.
                listImgSrc.add(image.substring(matcher.start(), matcher.start(2)));
            }
        }
        return listImgSrc;
    }

    /**
     * Downloads every URL in the list into the current working directory.
     * A failure on one image is reported and does not stop the remaining downloads.
     *
     * @param listImgSrc image URLs to download
     */
    private void Download(List<String> listImgSrc) {
        for (String url : listImgSrc) {
            try {
                // File name is whatever follows the last '/' in the URL.
                String imageName = url.substring(url.lastIndexOf("/") + 1);
                downloadImage(url, imageName);
                System.out.println(imageName + "下载完成");
            } catch (Exception e) {
                // Best effort per image: report which URL failed and why, then continue.
                System.out.println("下载失败:" + url + " (" + e + ")");
            }
        }
    }

    /** Copies the bytes served at {@code url} into the file {@code imageName}, closing both streams even on failure. */
    private void downloadImage(String url, String imageName) throws java.io.IOException {
        try (InputStream in = new URL(url).openStream();
             FileOutputStream fo = new FileOutputStream(new File(imageName))) {
            System.out.println("开始下载:" + url);
            byte[] buf = new byte[1024];
            int length;
            while ((length = in.read(buf, 0, buf.length)) != -1) {
                fo.write(buf, 0, length);
            }
        }
    }
}