Preparation
Pick a suitable website and use Jsoup to read the page's DOM elements.
Target site: http://www.win4000.com/zt/meinv.html
Goal: crawl the photo gallery images.
Create a new project with Spring Boot and add the dependencies:
```xml
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <optional>true</optional>
    </dependency>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-test</artifactId>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.7</version>
    </dependency>
</dependencies>
```
Open the page
Use the browser's element inspector to examine the page structure and find the DOM nodes that contain the images.

The a tags sit at the following DOM path:
`.Left_bar .tab_tj .tab_box ul li a`
Parse the page with Jsoup in three steps (a quick sketch of step 1 follows this list):
- Step 1: collect the href of every gallery link on the current page
- Step 2: open each gallery page and read the src attribute of its img
- Step 3: download the image with a utility class
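As a sanity check for step 1, here is a minimal sketch that lists the gallery links. It assumes the page is reachable and the selector still matches the current markup, and it uses Jsoup's built-in connect rather than the HttpClientUtils class shown later:

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorDemo {
    public static void main(String[] args) throws Exception {
        // Fetch the list page; a browser-like User-Agent avoids trivial blocking
        Document doc = Jsoup.connect("http://www.win4000.com/zt/meinv.html")
                .userAgent("Mozilla/5.0")
                .get();
        // Each matched <a> is the entry link of one gallery
        for (Element a : doc.select(".Left_bar .tab_tj .tab_box ul li a")) {
            System.out.println(a.attr("abs:href"));
        }
    }
}
```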
The full crawler code:
```java
import com.felix.project.util.FileUtils;
import com.felix.project.util.HttpClientUtils;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import java.io.IOException;

@Slf4j
@Component
public class CrawlPage {

    private static final String HTTPURL = "http://www.win4000.com/zt/meinv_1.html";
    private static final String FILEPATH = "E:/jsoup/images/";

    public static void crawlImg() throws IOException {
        int count = 0;
        log.info("Start crawling --> target: " + HTTPURL);
        String html = HttpClientUtils.getHtml(HTTPURL, "utf-8");
        Document doc = Jsoup.parse(html);
        // Step 1: every matched <a> is the entry link of one gallery
        Elements aImgElement = doc.select(".Left_bar .tab_tj .tab_box ul li a");
        for (Element a : aImgElement) {
            String href = a.attr("href");
            int index = aImgElement.indexOf(a) + 1;
            // Strip the ".html" suffix so "_<page>.html" can be appended below
            // (String.split(".html") would treat the dot as a regex wildcard)
            String url = href.substring(0, href.lastIndexOf(".html"));
            log.info("Fetching gallery " + index);
            // The <em> in the gallery title holds the number of pages
            int page = Integer.parseInt(Jsoup.parse(HttpClientUtils.getHtml(href, "utf-8"))
                    .select(".Bigimg .ptitle em").text());
            int total = 0;
            for (int i = 1; i <= page; i++) {
                // Step 2: each page of a gallery shows exactly one image
                String imgHtml = HttpClientUtils.getHtml(url + "_" + i + ".html", "utf-8");
                Document imgDoc = Jsoup.parse(imgHtml);
                Elements select = imgDoc.select(".pic_main .col-main .main-wrap .pic-meinv a");
                String src = select.select("img").attr("src");
                String title = select.select("img").attr("title");
                // Step 3: save the image into a folder named after the gallery title
                FileUtils.downloadImage(src, FILEPATH + title, System.currentTimeMillis() + ".jpg");
                log.info("Downloaded image " + i + " of gallery " + index + " (" + title + ")");
                count++;
                total++;
            }
            log.info("Gallery " + index + " contains " + total + " images");
        }
        log.info("Fetched " + count + " images this run");
    }
}
```
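The post does not show how crawlImg() is triggered. One option, sketched under the assumption of a standard Spring Boot entry point (the class name CrawlerApplication is invented for this example), is a CommandLineRunner that starts the crawl once the application context is up:

```java
import org.springframework.boot.CommandLineRunner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class CrawlerApplication implements CommandLineRunner {

    public static void main(String[] args) {
        SpringApplication.run(CrawlerApplication.class, args);
    }

    @Override
    public void run(String... args) throws Exception {
        // Start the crawl after startup; crawlImg is static on CrawlPage
        CrawlPage.crawlImg();
    }
}
```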
Utility classes
FileUtils downloads an image:
```java
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

public class FileUtils {

    public static void downloadImage(String url, String filePath, String fileName) {
        try {
            URL u = new URL(url);
            URLConnection connection = u.openConnection();
            connection.setReadTimeout(60000);
            // A browser-like User-Agent keeps the image host from rejecting the request
            connection.setRequestProperty("User-Agent",
                    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

            // Create the target directory if it does not exist yet
            File dir = new File(filePath);
            if (!dir.exists()) {
                dir.mkdirs();
            }
            File file = new File(filePath + File.separator + fileName);

            // Stream the image to disk in 1 KB chunks; try-with-resources
            // closes both streams even if the copy fails halfway
            try (InputStream in = connection.getInputStream();
                 BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
                byte[] buf = new byte[1024];
                int size;
                while (-1 != (size = in.read(buf))) {
                    out.write(buf, 0, size);
                }
            }
        } catch (IOException e) {
            // MalformedURLException and FileNotFoundException are IOExceptions too
            e.printStackTrace();
        }
    }
}
```
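A one-off usage example; the image URL, directory, and file name below are placeholders for illustration, not values taken from the target site:

```java
public class FileUtilsDemo {
    public static void main(String[] args) {
        // Placeholder values: any reachable image URL works the same way
        FileUtils.downloadImage(
                "https://www.example.com/sample.jpg", // image URL (placeholder)
                "E:/jsoup/images/demo",               // target directory, created if missing
                "sample.jpg");                        // file name on disk
    }
}
```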
HttpClientUtils fetches a page's HTML:
```java
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpClientUtils {

    public static String getHtml(String url, String charset) throws IOException {
        CloseableHttpClient client = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        // Send a browser-like User-Agent so the site serves the normal page
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

        CloseableHttpResponse response = client.execute(httpGet);
        HttpEntity entity = response.getEntity();
        String html = EntityUtils.toString(entity, charset);

        response.close();
        client.close();
        return html;
    }
}
```
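getHtml creates and closes a fresh CloseableHttpClient on every call, which is simple but throws away the connection pool each time. For many requests against the same host, a single shared client is cheaper; a sketch of that variant (not from the original post; CloseableHttpClient is safe to share across threads):

```java
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class SharedHttpClientUtils {

    // One client for the whole application, reused across requests
    private static final CloseableHttpClient CLIENT = HttpClients.createDefault();

    public static String getHtml(String url, String charset) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");
        // Closing the response releases the connection back to the pool;
        // the shared client itself stays open
        try (CloseableHttpResponse response = CLIENT.execute(httpGet)) {
            return EntityUtils.toString(response.getEntity(), charset);
        }
    }
}
```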