所使用的包分别为 commons-httpclient.jar 和 jsoup-1.6.1.jar;
利用 jsoup 提取页面中图片的 src 路径;
利用 httpclient 下载网站图片(注:下面的示例代码实际使用的是 JDK 自带的 java.net.HttpURLConnection)。
01 | public class DownImages { | |
02 | private static int COUNT = 0; |
03 | private static int DOWN_COUNT = 0; | |
04 |
|
05 | public static void jsoupHTML(String urlPath) throws Exception{ |
06 | Document doc = Jsoup.connect(urlPath).timeout(1000000).get(); |
07 | //:当前页中的图片 | |
08 | Elements srcLinks = doc.select("img[src$=.jpg]"); |
09 | for (Element link : srcLinks) { | |
10 | //:剔除标签,只剩链接路径 |
11 | String imagesPath = link.attr("src"); | |
12 | System.out.println("当前访问路径:"+imagesPath); |
13 | getImages(imagesPath, "d://images//0000"+ ++COUNT +".jpg"); | |
14 | } |
15 |
| |
16 | //:提取网站中所有的href连接 |
17 | Elements linehrefs = doc.select("a[href]"); | |
18 |
|
19 | for (Element linehref : linehrefs) { | |
20 | String lihr = linehref.attr("href"); |
21 | if(lihr.length()>4){ | |
22 | String ht = lihr.substring(0, 4); |
23 | String htt = lihr.substring(0, 1); |
24 | if(!ht.equals("http") && htt.equals("/")){ |
25 | lihr = urlPath + lihr; | |
26 | } |
27 | if(lihr.substring(0, 4).equals("http")){ | |
28 | Document docs = Jsoup.connect(lihr).timeout(1000000).get(); |
29 | Elements links = docs.select("img[src$=.jpg]"); |
30 | for (Element link : links) { |
31 | //:剔除标签,只剩链接路径 |
32 | String imagesPath = link.attr("src"); |
33 | System.out.println("当前访问路径:"+imagesPath); | |
34 | getImages(imagesPath, "d://images//0000"+ COUNT++ +".jpg"); |
35 | } | |
36 | } |
37 | } | |
38 | } |
39 | } | |
40 |
|
41 |
| |
42 | /** |
43 | * @param urlPath 图片路径 | |
44 | * @throws Exception |
45 | */ | |
46 | public static void getImages(String urlPath,String fileName) throws Exception{ |
47 | URL url = new URL(urlPath);//:获取的路径 | |
48 | //:http协议连接对象 |
49 | HttpURLConnection conn = (HttpURLConnection) url.openConnection(); | |
50 | conn.setRequestMethod("GET"); |
51 | conn.setReadTimeout(6 * 10000); | |
52 | if (conn.getResponseCode() <10000){ |
53 | InputStream inputStream = conn.getInputStream(); | |
54 | byte[] data = readStream(inputStream); |
55 | if(data.length>(1024*10)){ | |
56 | FileOutputStream outputStream = new FileOutputStream(fileName); |
57 | outputStream.write(data); | |
58 | System.err.println("第"+ ++DOWN_COUNT +"图片下载成功"); |
59 | outputStream.close(); | |
60 | } |
61 | } | |
62 |
|
63 | } | |
64 |
|
65 | /** | |
66 | * 读取url中数据,并以字节的形式返回 |
67 | * @param inputStream | |
68 | * @return |
69 | * @throws Exception | |
70 | */ |
71 | public static byte[] readStream(InputStream inputStream) throws Exception{ |
72 | ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); |
73 | byte[] buffer = new byte[1024]; | |
74 | int len = -1; |
75 | while((len = inputStream.read(buffer)) !=-1){ | |
76 | outputStream.write(buffer, 0, len); |
77 | } | |
78 | outputStream.close(); |
79 | inputStream.close(); | |
80 | return outputStream.toByteArray(); |
81 | } | |
82 |
|
83 | public static void main(String[] args) { | |
84 | try { |
85 | String urlPath = "http://www.22mm.cc/"; | |
86 | jsoupHTML(urlPath); |
87 | } catch (Exception e) { | |
88 | e.printStackTrace(); |
89 | }finally{ | |
90 | System.out.println("共访问"+COUNT+"张图片,其中下载"+DOWN_COUNT+"张图片"); |
91 | } | |
92 | } |
93 | } |