jsoup html to text,Jsoup和htmlunit结合使用。

maven依赖:

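<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>

<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.40.0</version>
</dependency>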

代码:

package com.ybjdw.tool.utils;

import com.gargoylesoftware.htmlunit.BrowserVersion;

import com.gargoylesoftware.htmlunit.WebClient;

import com.gargoylesoftware.htmlunit.html.HtmlPage;

import org.apache.commons.logging.LogFactory;

import org.jsoup.Connection;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import java.io.File;

import java.util.logging.Level;

/**

* author: zhanggw

* 创建时间: 2020/7/23

*/

public class JsoupUtil {

private static Logger logger = LoggerFactory.getLogger(JsoupUtil.class);

public static void main(String[] args) {

try{

String shopName = "KaKa studios";

String mainUrl = "https://www.vvic.com";

String localDir = "D:/tmp";

String shopLink;

logger.debug("开始搜索店铺名称:{}", shopName);

// 搜索档口

Connection connect = Jsoup.connect("https://www.vvic.com/gz/shops/search.html?q="+shopName);

connect.timeout(10000);

Document document = connect.get();

Element element = document.selectFirst("div[id=stallContent] dl dd span[class=cell ctrl-cell] a");

shopLink = mainUrl + element.attr("href");

logger.debug("店铺链接: {}", shopLink);

// 获取档口详情

connect = Jsoup.connect(shopLink);

connect.data("sort","up_time-desc");

connect.data("currentPage", "1");

document = connect.get();

element = document.selectFirst("div[class=goods-list shop-list clearfix] ul");

Elements itemEleList = element.getElementsByTag("li");

// 构造一个webClient 模拟Chrome 浏览器

WebClient webClient = new WebClient(BrowserVersion.CHROME);

for(int i=1; i40){

break;

}

Element itemEle = itemEleList.get(i);

String href = itemEle.selectFirst("div[class=item] div[class=pic j-vct] a").attr("href");

String itemLink = mainUrl+href;

logger.debug("商品链接: {}", itemLink);

// 获取商品详情

//屏蔽日志信息

LogFactory.getFactory().setAttribute("org.apache.commons.logging.Log",

"org.apache.commons.logging.impl.NoOpLog");

java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF);

//支持JavaScript

webClient.getOptions().setJavaScriptEnabled(true);

webClient.getOptions().setCssEnabled(true);

webClient.getOptions().setActiveXNative(false);

webClient.getOptions().setThrowExceptionOnScriptError(false);

webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);

webClient.getOptions().setUseInsecureSSL(true);

webClient.getOptions().setTimeout(10000);

HtmlPage rootPage = webClient.getPage(itemLink);

//设置一个运行JavaScript的时间

webClient.waitForBackgroundJavaScript(5000);

String html = rootPage.asXml();

document = Jsoup.parse(html);

// 货号

Element productCodeEle = document.selectFirst("div[class=product-detail] dl[class=summary clearfix] div[class=value ff-arial]");

String productCode = productCodeEle.text().trim();

element = document.selectFirst("div[id=info] div[class=d-content]");

Elements imgEleList = element.getElementsByTag("img");

logger.debug("商品详情图如下:");

imgEleList.forEach(img->{

String url = img.attr("data-original");

String suffix = url.substring(url.lastIndexOf("."));

String localPath = localDir + "/" + shopName + "/" + productCode;

logger.debug("url:{}, suffix:{}, localPatch:{}", url, suffix, localPath);

File file = new File(localPath);

if(!file.exists()){

file.mkdirs();

}

logger.debug("开始下载:{},本地地址:{}", url, localPath+"/"+System.currentTimeMillis()+suffix);

FileUtil.downloadFileConcurrent(url, localPath+"/"+System.currentTimeMillis()+suffix);

});

}

}catch (Exception e){

logger.debug("爬取异常",e);

}

logger.debug("搜款网图片下载完毕!");

}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值