jsoup提取链接下载网站图片

最新推荐文章于 2024-09-02 16:20:11 发布

tfy1332

最新推荐文章于 2024-09-02 16:20:11 发布

阅读量2.8k

点赞数 1

CC 4.0 BY-SA版权

本文链接：https://blog.youkuaiyun.com/tfy1332/article/details/21563079

本文介绍使用jsoup和httpclient进行网页图片抓取的方法。通过解析HTML文档选择包含图片链接的元素，并下载指定格式的图片到本地。同时，程序还会访问起始页面中的各个链接（仅向下一层，不做递归），以扩大搜索范围。

jsoup提取链接下载网站图片

所使用的包分别为commons-httpclient.jar和jsoup-1.6.1.jar；
利用jsoup提取页面中的src路径；
利用 httpclient下载网站图片

1. [代码]jsoup提取src路径，下载网站图片

01	public class DownImages {
02	private static int COUNT = 0;

03	private static int DOWN_COUNT = 0;
04

05	public static void jsoupHTML(String urlPath) throws Exception{
06	Document doc = Jsoup.connect(urlPath).timeout(1000000).get();

07	//:当前页中的图片
08	Elements srcLinks = doc.select("img[src$=.jpg]");

09	for (Element link : srcLinks) {
10	//:剔除标签，只剩链接路径

11	String imagesPath = link.attr("src");
12	System.out.println("当前访问路径:"+imagesPath);

13	getImages(imagesPath, "d://images//0000"+ ++COUNT +".jpg");
14	}

15
16	//:提取网站中所有的href连接

17	Elements linehrefs = doc.select("a[href]");
18

19	for (Element linehref : linehrefs) {
20	String lihr = linehref.attr("href");

21	if(lihr.length()>4){
22	String ht = lihr.substring(0, 4);

23	String htt = lihr.substring(0, 1);
24	if(!ht.equals("http") && htt.equals("/")){

25	lihr = urlPath + lihr;
26	}

27	if(lihr.substring(0, 4).equals("http")){
28		Document docs = Jsoup.connect(lihr).timeout(1000000).get();

29	Elements links = docs.select("img[src$=.jpg]");
30	for (Element link : links) {

31	//:剔除标签，只剩链接路径
32	String imagesPath = link.attr("src");

33		System.out.println("当前访问路径:"+imagesPath);
34	getImages(imagesPath, "d://images//0000"+ COUNT++ +".jpg");

35	}
36	}

37	}
38	}

39	}
40

41
42	/**

43	* @param urlPath 图片路径
44	* @throws Exception

45	*/
46	public static void getImages(String urlPath,String fileName) throws Exception{

47	URL url = new URL(urlPath);//：获取的路径
48	//:http协议连接对象

49	HttpURLConnection conn = (HttpURLConnection) url.openConnection();
50	conn.setRequestMethod("GET");

51	conn.setReadTimeout(6 * 10000);
52	if (conn.getResponseCode() <10000){

53	InputStream inputStream = conn.getInputStream();
54	byte[] data = readStream(inputStream);

55	if(data.length>(1024*10)){
56	FileOutputStream outputStream = new FileOutputStream(fileName);

57	outputStream.write(data);
58	System.err.println("第"+ ++DOWN_COUNT +"图片下载成功");

59	outputStream.close();
60	}

61	}
62

63	}
64

65	/**
66	* 读取url中数据，并以字节的形式返回

67	* @param inputStream
68	* @return

69	* @throws Exception
70	*/

71	public static byte[] readStream(InputStream inputStream) throws Exception{
72	ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

73	byte[] buffer = new byte[1024];
74	int len = -1;

75	while((len = inputStream.read(buffer)) !=-1){
76	outputStream.write(buffer, 0, len);

77	}
78	outputStream.close();

79	inputStream.close();
80	return outputStream.toByteArray();

81	}
82

83	public static void main(String[] args) {
84	try {

85	String urlPath = "http://www.22mm.cc/";
86	jsoupHTML(urlPath);

87	} catch (Exception e) {
88	e.printStackTrace();

89	}finally{
90	System.out.println("共访问"+COUNT+"张图片，其中下载"+DOWN_COUNT+"张图片");

91	}
92	}

}