// Java crawler: follows "next page" links without limit and downloads every image, implemented with jsoup
package com.test;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Simple image crawler.
 *
 * <p>Downloads every listed image from a start page, then keeps following the page's "next"
 * link and downloading until no further page is found. Pages are fetched and parsed with jsoup;
 * images are fetched with {@link HttpURLConnection} and written below {@link #filepath}.
 */
public class ReptileTest {

    /** Site root; links and image paths found in pages are resolved against it. */
    static String baseurl = "https://www.4kbizhi.com/";

    /**
     * Updated by {@link #nextpage(String)} to the URL it last resolved; {@code null} once no
     * next page was found.
     */
    static String geturl = "https://www.4kbizhi.com/";

    /** Path prefix (normally a directory with a trailing separator) images are saved under. */
    static String filepath = "C:\\Users\\Administrator\\Desktop\\新建文件\\";

    /** Connect timeout for image downloads, so a dead host cannot stall the crawl forever. */
    private static final int CONNECT_TIMEOUT_MS = 10_000;

    /** Read (idle) timeout for image downloads. */
    private static final int READ_TIMEOUT_MS = 30_000;

    /**
     * Entry point: downloads the images of {@link #baseurl}, then of every following page.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("初始下载页面:" + baseurl);
        String html = getHtml(baseurl);
        downloadImg(getImgSrcListFromHtml(html), filepath);

        // Remember every page already processed so a cycle in the "next" links cannot loop forever.
        Set<String> visited = new HashSet<>();
        visited.add(baseurl);

        while (true) {
            // The "next" link is read from the page we already hold; each page is fetched once.
            String next = nextpage(html);
            if (next == null || !visited.add(next)) {
                break;
            }
            System.out.println("下一个下载页面:" + next);
            html = getHtml(next);
            downloadImg(getImgSrcListFromHtml(html), filepath);
        }
        System.out.println("下载完毕");
    }

    /**
     * Finds the URL of the following page in the given page content.
     *
     * <p>The last element matching the {@code .next} selector is used and its {@code href} is
     * resolved against {@link #baseurl}. As a side effect {@link #geturl} is set to the result.
     *
     * @param html page content; may be {@code null} or empty
     * @return absolute URL of the next page, or {@code null} when the page has no usable next link
     */
    public static String nextpage(String html) {
        String next = null;
        if (html != null && !html.isEmpty()) {
            Document document = Jsoup.parse(html);
            Elements elements = document.select(".next");
            if (!elements.isEmpty()) {
                // attr() yields "" (never null) for a missing attribute, so test for emptiness.
                String href = elements.get(elements.size() - 1).attr("href").trim();
                // A fragment-only link points back at the current page, not at a next one.
                if (!href.isEmpty() && !href.startsWith("#")) {
                    next = resolveUrl(baseurl, href);
                }
            }
        }
        if (next != null) {
            System.out.println("测试next:" + next);
        }
        geturl = next;
        System.out.println("测试geturl:" + geturl);
        return next;
    }

    /**
     * Collects the image URLs listed on a page.
     *
     * <p>Images are selected with {@code ul[class=item] > li > a} followed by {@code img}; each
     * {@code src} is resolved against {@link #baseurl}. Images without a usable {@code src} are
     * skipped.
     *
     * @param html page content; may be {@code null} or empty
     * @return absolute image URLs in document order, never {@code null}
     */
    public static List<String> getImgSrcListFromHtml(String html) {
        List<String> list = new ArrayList<>();
        if (html == null || html.isEmpty()) {
            return list;
        }
        Document document = Jsoup.parse(html);
        Elements elements = document.select("ul[class=item] > li > a").select("img");
        for (int i = 0; i < elements.size(); i++) {
            String src = elements.get(i).attr("src").trim();
            if (src.isEmpty()) {
                continue;
            }
            String absolute = resolveUrl(baseurl, src);
            if (absolute != null) {
                list.add(absolute);
            }
        }
        return list;
    }

    /**
     * Fetches the content of a page.
     *
     * @param url page URL; may be {@code null} or empty
     * @return the response body, or an empty string when the URL is missing or the request fails
     */
    public static String getHtml(String url) {
        if (url == null || url.isEmpty()) {
            return "";
        }
        try {
            return Jsoup.connect(url).execute().body();
        } catch (IOException e) {
            // Best effort: report and let the caller continue with an empty page.
            System.err.println("获取页面失败:" + url + " (" + e + ")");
            return "";
        }
    }

    /**
     * Downloads every image in the list and stores it as {@code filepath + <file name>}.
     *
     * <p>Each image is handled independently: a failure is reported on standard error and the
     * remaining images are still downloaded.
     *
     * @param list image URLs; {@code null} or empty means nothing to do
     * @param filepath path prefix the file name is appended to
     */
    public static void downloadImg(List<String> list, String filepath) {
        if (list == null || list.isEmpty() || filepath == null) {
            return;
        }
        for (String imgUrl : list) {
            try {
                downloadOne(imgUrl, filepath);
            } catch (IOException | RuntimeException e) {
                System.err.println("下载图片失败:" + imgUrl + " (" + e + ")");
            }
        }
    }

    /**
     * Reads a stream to its end and returns everything that was read.
     *
     * <p>The stream is not closed by this method.
     *
     * @param inputStream stream to drain
     * @return the bytes read, or {@code null} when reading fails
     */
    public static byte[] getBytesFromInputStream(InputStream inputStream) {
        ByteArrayOutputStream arrayOutputStream = new ByteArrayOutputStream();
        byte[] buffer = new byte[1024];
        try {
            int len;
            while ((len = inputStream.read(buffer)) != -1) {
                arrayOutputStream.write(buffer, 0, len);
            }
            return arrayOutputStream.toByteArray();
        } catch (IOException e) {
            System.err.println("读取数据流失败 (" + e + ")");
            return null;
        }
    }

    /** Downloads a single image and writes it to {@code filepath + <file name>}. */
    private static void downloadOne(String imgUrl, String filepath) throws IOException {
        if (imgUrl == null) {
            throw new IOException("image URL is null");
        }
        String name = fileNameFromUrl(imgUrl);
        if (name.isEmpty()) {
            throw new IOException("no file name in URL");
        }
        HttpURLConnection connection = (HttpURLConnection) new URL(imgUrl).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);
        try {
            byte[] bytes;
            try (InputStream in = connection.getInputStream()) {
                bytes = getBytesFromInputStream(in);
            }
            if (bytes == null) {
                throw new IOException("image data could not be read");
            }
            String myfilepath = filepath + name;
            System.out.println("生成图片路径:" + myfilepath);
            File target = new File(myfilepath);
            File parent = target.getParentFile();
            if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
                throw new IOException("cannot create directory " + parent);
            }
            try (FileOutputStream out = new FileOutputStream(target)) {
                out.write(bytes);
            }
        } finally {
            connection.disconnect();
        }
    }

    /** Returns the last path segment of a URL, without query string or fragment. */
    private static String fileNameFromUrl(String url) {
        String path = url;
        int fragment = path.indexOf('#');
        if (fragment >= 0) {
            path = path.substring(0, fragment);
        }
        int query = path.indexOf('?');
        if (query >= 0) {
            path = path.substring(0, query);
        }
        return path.substring(path.lastIndexOf('/') + 1);
    }

    /**
     * Resolves a possibly relative reference against a base URL.
     *
     * @return the absolute http(s) URL, or {@code null} when it cannot be resolved or uses
     *     another scheme (for example {@code javascript:} or {@code data:})
     */
    private static String resolveUrl(String base, String ref) {
        try {
            URL resolved = new URL(new URL(base), ref);
            String protocol = resolved.getProtocol();
            if (!"http".equals(protocol) && !"https".equals(protocol)) {
                return null;
            }
            return resolved.toString();
        } catch (IOException e) {
            // MalformedURLException: unknown scheme or unparsable reference.
            return null;
        }
    }
}





