HtmlUnit-JavaScript爬坑
源码地址:https://download.youkuaiyun.com/download/NaXieNianYiShiGuJi/15729089
maven设置
直接贴代码
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the HtmlUnit demo project used in this article. -->
<modelVersion>4.0.0</modelVersion>
<groupId>com.test.htmlunit</groupId>
<artifactId>htm-unit</artifactId>
<version>0.0.1-SNAPSHOT</version>
<!-- Use UTF-8 for sources, reports and compilation (the sample code contains Chinese literals). -->
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<maven.compiler.encoding>UTF-8</maven.compiler.encoding>
</properties>
<build>
<!-- NOTE(review): finalName "http-utils" differs from artifactId "htm-unit"; confirm this is intended. -->
<finalName>http-utils</finalName>
<plugins>
<!-- Javadoc generation, UTF-8 encoded. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>2.10.3</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<!-- Resource copying, UTF-8 encoded. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>2.7</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<!-- Compile for Java 8 source and target levels. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<!-- NOTE(review): junit is declared without a scope, so it defaults to compile scope; consider test scope if it is only used by tests. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<!-- HtmlUnit: the headless browser the article is about. The commented-out line is an older version that was tried previously. -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.47.1</version>
<!-- <version>2.29</version> -->
</dependency>
<!-- jsoup: used in the samples to parse the XML produced by HtmlPage.asXml() and run CSS selectors on it. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<!-- The remaining dependencies are not referenced by the code shown in this article; presumably used elsewhere in the project. -->
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.4</version>
</dependency>
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.9</version>
</dependency>
<!-- Transitive commons-beanutils and commons-logging are excluded here and declared explicitly below with fixed versions. -->
<dependency>
<groupId>commons-digester</groupId>
<artifactId>commons-digester</artifactId>
<version>2.0</version>
<exclusions>
<exclusion>
<artifactId>commons-beanutils</artifactId>
<groupId>commons-beanutils</groupId>
</exclusion>
<exclusion>
<artifactId>commons-logging</artifactId>
<groupId>commons-logging</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>commons-beanutils</groupId>
<artifactId>commons-beanutils</artifactId>
<version>1.9.2</version>
<exclusions>
<exclusion>
<artifactId>commons-logging</artifactId>
<groupId>commons-logging</groupId>
</exclusion>
</exclusions>
</dependency>
<!-- Transitive commons-io is excluded here and declared explicitly below. -->
<dependency>
<groupId>commons-fileupload</groupId>
<artifactId>commons-fileupload</artifactId>
<version>1.3.1</version>
<exclusions>
<exclusion>
<artifactId>commons-io</artifactId>
<groupId>commons-io</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.2</version>
</dependency>
<!-- Transitive commons-codec and commons-logging are excluded in favour of the explicit declarations above. -->
<dependency>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
<version>3.1</version>
<exclusions>
<exclusion>
<artifactId>commons-codec</artifactId>
<groupId>commons-codec</groupId>
</exclusion>
<exclusion>
<artifactId>commons-logging</artifactId>
<groupId>commons-logging</groupId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>
常规方法介绍
直接贴代码,注释很详细,上手很轻松
/**
 * Minimal HtmlUnit walkthrough: opens the Baidu home page with an emulated Chrome,
 * types a keyword into the search box, clicks the search button and prints the
 * resulting page as XML.
 *
 * @param args unused
 * @throws Exception declared for convenience; failures inside the browsing steps
 *                   are caught and printed instead of being propagated
 */
public static void main(String[] args) throws Exception {
    // try-with-resources closes the client even when the option setup below fails.
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        // Do not abort on page-script errors or on non-200 HTTP status codes.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setActiveXNative(false);
        // The page is never rendered, so CSS processing is switched off.
        webClient.getOptions().setCssEnabled(false);
        // Important: JavaScript must be enabled and AJAX support configured.
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());

        HtmlPage page = webClient.getPage("https://www.baidu.com");

        HtmlInput keywordInput = (HtmlInput) page.getElementById("kw");
        if (keywordInput == null) {
            System.err.println("Search box #kw not found on " + page.getUrl());
            return;
        }
        keywordInput.type("李晟");

        HtmlInput searchButton = (HtmlInput) page.getElementById("su");
        if (searchButton == null) {
            System.err.println("Search button #su not found on " + page.getUrl());
            return;
        }
        // Clicking the input submits the search.
        HtmlPage nextPage = searchButton.click();

        // Asynchronous JS needs time to finish: wait up to 10000 ms (10 seconds).
        webClient.waitForBackgroundJavaScript(10000);
        System.out.println(nextPage.asXml());
    } catch (Exception e) {
        e.printStackTrace();
    }
}
调用页面JavaScript方法,贴代码
// Fragment: calls a function defined by the page's own scripts. `nextPage` is the
// HtmlPage obtained in the previous example.
// NOTE(review): "funtion1" must match the function name declared by the target page;
// it looks like a placeholder (possibly a misspelling of "function1") -- confirm.
String javaScriptCode = "funtion1('235','234')";
ScriptResult result = nextPage.executeJavaScript(javaScriptCode);
//webClient.waitForBackgroundJavaScript(3000);// async JS takes time; enabling this would wait up to 3000 ms for it to finish
// If the script triggered AJAX or navigation, read the page again from the result
// rather than reusing the old reference.
HtmlPage nextPage2=(HtmlPage)result.getNewPage();
System.out.println(nextPage2.asXml());
是不是感觉很简单,当然调用JavaScript方法需要对页面源码有一定阅读能力
爬坑
好了,简单入手就到这里了,下面来个挑战
需求如下,打开头条搜索,搜索关键词中国新闻网,点击用户,点击中国新闻网
如图所示,我们把步骤分解一下
1、我们可以直接访问头条搜索地址https://www.toutiao.com/search/?keyword=中国新闻网
2、点击用户
3、点击中国新闻网
直接贴代码
// Fragment: first attempt at the Toutiao search flow. `webClient` is the client
// configured as in the previous example. Opens the search page, clicks the third
// tab ("用户" / Users) and prints the tab bar markup.
try {
// setUseInsecureSSL(true) makes the client accept any server certificate; plain
// HTTPS works without it. NOTE(review): only keep this for throw-away experiments.
webClient.getOptions().setUseInsecureSSL(true);
// Suppress exceptions caused by JavaScript errors in the page.
webClient.getOptions().setThrowExceptionOnScriptError(false);
JavaScriptEngine engine = (JavaScriptEngine)webClient.getJavaScriptEngine();
// NOTE(review): holdPosponedActions() looks like an engine-internal hook; confirm it is needed here.
engine.holdPosponedActions();
HtmlPage page = webClient.getPage("https://www.toutiao.com/search/?keyword=中国新闻网");
webClient.waitForBackgroundJavaScript(10000);// async JS takes time: wait up to 10000 ms (10 s, not 30) for it to finish
// Locate the tab quickly via XPath; expected markup:
// <div class="tabBar"><ul><li>综合</li><li>视频</li><li>用户</li></ul></div>
List<HtmlListItem> tabli=page.getByXPath("//div[@class='tabBar']/ul/li[3]");
// NOTE(review): get(0) throws IndexOutOfBoundsException when the tab was not rendered.
HtmlPage page2=tabli.get(0).click();
webClient.waitForBackgroundJavaScript(10000);// async JS takes time: wait up to 10000 ms (10 s, not 30) for it to finish
//System.out.println(page2.asXml());
// Parse the resulting DOM with jsoup for convenient CSS-style selection.
Document doc = Jsoup.parse(page2.asXml());
// NOTE(review): first() returns null when nothing matches, so element.html() below can throw NullPointerException.
Element element = doc.select("div[class=tabBar]").first();
System.out.println(":::::::::::::::::::::"+element.html());
} catch (Exception e) {
e.printStackTrace();
} finally {
webClient.close();
}
代码中有段doc.select("div[class=tabBar]"),附下jsoup的select说明
跑个代码看看,满怀期待
没想着一次搞定,果然我们拿不到搜索的内容
先看看浏览器的网络请求
再在控制台找找,看有没有走这个ajax请求
控制台有这个ajax请求的链接,那为啥我们没有拿到信息呢,我们再看看控制台
原来头条的js经过混淆压缩了,导致htmlunit无法加载内容
想想其他办法,既然ajax请求链接在控制台有打印,那我们想办法拿到ajax请求链接不就好了吗
看看源码
貌似没有啥接口开放啊,不过没关系,java么, 既然有这个东西,那我重写下呗,没有我拿不到的东西
package com.lz.test;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * AJAX controller that records the most recent AJAX request seen by HtmlUnit so the
 * caller can read its URL afterwards, while keeping the synchronization decision of
 * {@link NicelyResynchronizingAjaxController}.
 *
 * <p>Only the last request is retained; every call to
 * {@link #processSynchron(HtmlPage, WebRequest, boolean)} overwrites the previous one.
 */
public class MyNicelyResynchronizingAjaxController extends NicelyResynchronizingAjaxController {

    /**
     * Last request passed to {@link #processSynchron}; {@code null} until the first
     * AJAX call. Left public for backward compatibility -- prefer
     * {@link #getAjaxSetting()}.
     */
    public WebRequest ajaxSetting;

    /**
     * @return the most recently captured AJAX request, or {@code null} if none was seen
     */
    public WebRequest getAjaxSetting() {
        return ajaxSetting;
    }

    /**
     * @param ajaxSetting request to store as the "last seen" AJAX request
     */
    public void setAjaxSetting(WebRequest ajaxSetting) {
        this.ajaxSetting = ajaxSetting;
    }

    /**
     * Captures the request, then delegates the sync/async decision to the parent.
     *
     * <p>The previous version called the parent but ignored its result and returned
     * {@code !async}, so asynchronous calls were never reported as "process
     * synchronously" and the parent's resynchronization was lost.
     *
     * @param page     page issuing the AJAX call
     * @param settings the AJAX request about to be performed
     * @param async    whether the script asked for an asynchronous call
     * @return the parent's decision on whether to perform the call synchronously
     */
    @Override
    public boolean processSynchron(final HtmlPage page, final WebRequest settings, final boolean async) {
        ajaxSetting = settings;
        return super.processSynchron(page, settings, async);
    }
}
把webClient.setAjaxController(new NicelyResynchronizingAjaxController());换成我的,顺便对页面加一下错误监听,直接贴代码
/**
 * Toutiao walkthrough using the request-capturing AJAX controller: opens the search
 * page for "中国新闻网", clicks the third tab (Users), prints the captured AJAX request
 * URL and then follows the first result link.
 *
 * <p>Each DOM lookup is checked before use, so a page that was not fully rendered
 * produces a diagnostic message instead of an exception, and the captured AJAX URL
 * is still printed when the result section is missing.
 *
 * @param args unused
 * @throws Exception declared for convenience; failures inside the browsing steps
 *                   are caught and printed instead of being propagated
 */
public static void main(String[] args) throws Exception {
    // try-with-resources closes the client even when the setup below fails.
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
        // Do not abort on page-script errors or on non-200 HTTP status codes.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setActiveXNative(false);
        // The page is never rendered, so CSS processing is switched off.
        webClient.getOptions().setCssEnabled(false);
        // Important: JavaScript must be enabled and AJAX support configured.
        webClient.getOptions().setJavaScriptEnabled(true);
        MyNicelyResynchronizingAjaxController myajax = new MyNicelyResynchronizingAjaxController();
        webClient.setAjaxController(myajax);

        // Accepts any server certificate. NOTE(review): only suitable for experiments.
        webClient.getOptions().setUseInsecureSSL(true);
        // Report JavaScript problems on the console instead of hiding them.
        webClient.setJavaScriptErrorListener(newConsoleErrorListener());

        JavaScriptEngine engine = (JavaScriptEngine) webClient.getJavaScriptEngine();
        // NOTE(review): looks like an engine-internal hook; kept from the original.
        engine.holdPosponedActions();

        HtmlPage page = webClient.getPage("https://www.toutiao.com/search/?keyword=中国新闻网");
        // Asynchronous JS needs time to finish: wait up to 10000 ms (10 seconds).
        webClient.waitForBackgroundJavaScript(10000);

        // Expected markup: <div class="tabBar"><ul><li>综合</li><li>视频</li><li>用户</li></ul></div>
        List<HtmlListItem> tabItems = page.getByXPath("//div[@class='tabBar']/ul/li[3]");
        if (tabItems.isEmpty()) {
            System.err.println("Third tab not found under div.tabBar; page was probably not rendered");
            return;
        }
        HtmlPage page2 = tabItems.get(0).click();
        webClient.waitForBackgroundJavaScript(10000);
        System.out.println(page2.asXml());

        // Parse the resulting DOM with jsoup for convenient CSS-style selection.
        Document doc = Jsoup.parse(page2.asXml());
        Element element = doc.select("div[class=sections]").first();
        if (element != null) {
            System.out.println(":::::::::::::::::::::"+element.html());
        } else {
            System.err.println("div.sections not present in the rendered page");
        }

        // URL of the last AJAX request observed by the controller (null if none happened).
        WebRequest ajaxSetting = myajax.getAjaxSetting();
        if (ajaxSetting != null) {
            System.out.println("ajax=============="+ajaxSetting.getUrl());
        } else {
            System.err.println("No AJAX request was captured");
        }

        List<HtmlAnchor> resultLinks = page2.getByXPath("//div[@class='sections']/div[1]/a");
        if (resultLinks.isEmpty()) {
            System.err.println("No result link found under div.sections");
            return;
        }
        HtmlPage page3 = resultLinks.get(0).click();
        webClient.waitForBackgroundJavaScript(10000);
        System.out.println(page3.asXml());
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/** Builds a listener that prints a short tag for every JavaScript problem HtmlUnit reports. */
private static JavaScriptErrorListener newConsoleErrorListener() {
    return new JavaScriptErrorListener() {
        @Override
        public void warn(String message, String sourceName, int line, String lineSource, int lineOffset) {
            System.out.println("warn");
        }

        @Override
        public void timeoutError(HtmlPage page, long allowedTime, long executionTime) {
            System.out.println("timeoutError");
        }

        @Override
        public void scriptException(HtmlPage page, ScriptException scriptException) {
            System.out.println("scriptException"+page.getUrl());
        }

        @Override
        public void malformedScriptURL(HtmlPage page, String url, MalformedURLException malformedURLException) {
            System.out.println("malformedScriptURL"+url);
        }

        @Override
        public void loadScriptError(HtmlPage page, URL scriptUrl, Exception exception) {
            System.out.println("loadScriptError"+scriptUrl);
        }
    };
}
代码跑起来,看看控制台,结果如下。至此问题解决。
源码地址:https://download.youkuaiyun.com/download/NaXieNianYiShiGuJi/15729089
源码地址:https://download.youkuaiyun.com/download/NaXieNianYiShiGuJi/15729089
源码地址:https://download.youkuaiyun.com/download/NaXieNianYiShiGuJi/15729089