maven依赖:
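<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.40.0</version>
</dependency>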
代码:
package com.ybjdw.tool.utils;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.util.logging.Level;
/**
* author: zhanggw
* 创建时间: 2020/7/23
*/
public class JsoupUtil {
private static Logger logger = LoggerFactory.getLogger(JsoupUtil.class);
public static void main(String[] args) {
try{
String shopName = "KaKa studios";
String mainUrl = "https://www.vvic.com";
String localDir = "D:/tmp";
String shopLink;
logger.debug("开始搜索店铺名称:{}", shopName);
// 搜索档口
Connection connect = Jsoup.connect("https://www.vvic.com/gz/shops/search.html?q="+shopName);
connect.timeout(10000);
Document document = connect.get();
Element element = document.selectFirst("div[id=stallContent] dl dd span[class=cell ctrl-cell] a");
shopLink = mainUrl + element.attr("href");
logger.debug("店铺链接: {}", shopLink);
// 获取档口详情
connect = Jsoup.connect(shopLink);
connect.data("sort","up_time-desc");
connect.data("currentPage", "1");
document = connect.get();
element = document.selectFirst("div[class=goods-list shop-list clearfix] ul");
Elements itemEleList = element.getElementsByTag("li");
// 构造一个webClient 模拟Chrome 浏览器
WebClient webClient = new WebClient(BrowserVersion.CHROME);
for(int i=1; i40){
break;
}
Element itemEle = itemEleList.get(i);
String href = itemEle.selectFirst("div[class=item] div[class=pic j-vct] a").attr("href");
String itemLink = mainUrl+href;
logger.debug("商品链接: {}", itemLink);
// 获取商品详情
//屏蔽日志信息
LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log",
"org.apache.commons.logging.impl.NoOpLog");
java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);
//支持JavaScript
webClient.getOptions().setJavaScriptEnabled(true);
webClient.getOptions().setCssEnabled(true);
webClient.getOptions().setActiveXNative(false);
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
webClient.getOptions().setUseInsecureSSL(true);
webClient.getOptions().setTimeout(10000);
HtmlPage rootPage = webClient.getPage(itemLink);
//设置一个运行JavaScript的时间
webClient.waitForBackgroundJavaScript(5000);
String html = rootPage.asXml();
document = Jsoup.parse(html);
// 货号
Element productCodeEle = document.selectFirst("div[class=product-detail] dl[class=summary clearfix] div[class=value ff-arial]");
String productCode = productCodeEle.text().trim();
element = document.selectFirst("div[id=info] div[class=d-content]");
Elements imgEleList = element.getElementsByTag("img");
logger.debug("商品详情图如下:");
imgEleList.forEach(img->{
String url = img.attr("data-original");
String suffix = url.substring(url.lastIndexOf("."));
String localPath = localDir + "/" + shopName + "/" + productCode;
logger.debug("url:{}, suffix:{}, localPatch:{}", url, suffix, localPath);
File file = new File(localPath);
if(!file.exists()){
file.mkdirs();
}
logger.debug("开始下载:{},本地地址:{}", url, localPath+"/"+System.currentTimeMillis()+suffix);
FileUtil.downloadFileConcurrent(url, localPath+"/"+System.currentTimeMillis()+suffix);
});
}
}catch (Exception e){
logger.debug("爬取异常",e);
}
logger.debug("搜款网图片下载完毕!");
}
}