1、简介
HtmlUnit 是一款开源的 Java 页面分析工具:读取页面后,可以有效地使用 HtmlUnit 分析页面上的内容。该项目可以模拟浏览器运行,被誉为 Java 浏览器的开源实现。它是一个没有界面的浏览器,运行速度很快,常被视为 JUnit 的扩展之一。
2、官方API文档
HtmlUnit官网API文档
3、pom文件
<!-- HtmlUnit: the headless browser library exercised by the test class below -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.26</version>
</dependency>
<!-- JUnit 4: provides the org.junit.Test annotation used to run each demo method -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
<!-- SLF4J binding for log4j 1.2: backs the org.slf4j.Logger used by the test class -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.21</version>
</dependency>
4、测试代码
package com.sun.htmlunit;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.DomNodeList;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
/**
 * Introductory HtmlUnit examples written as JUnit tests.
 *
 * <p>Each test opens its own {@link WebClient}, fetches a live page over the
 * network and logs what it retrieved. Failures are logged (with the full stack
 * trace) rather than rethrown, so these methods act as runnable demos and do
 * not fail the build when a remote site is unreachable.
 *
 * @author sunt
 * @date 2017-04-17 11:04:22
 * @version v1.0
 */
public class HtmlUnitTestDemo {

    /** Logger shared by all demo tests. */
    private static final Logger logger = LoggerFactory.getLogger(HtmlUnitTestDemo.class);

    /**
     * Number of search submissions performed by {@link #test4()}.
     * NOTE(review): this issues that many sequential requests against an
     * external site; consider lowering it for routine runs.
     */
    private static final int SEARCH_REQUEST_COUNT = 1000;

    /** Upper bound, in milliseconds, that {@link #test7()} waits for background JavaScript. */
    private static final long JS_WAIT_TIMEOUT_MILLIS = 10000L;

    /**
     * Getting started: fetches a page with a default client and logs both its
     * XML form and its text form.
     */
    @Test
    public void test1() {
        // try-with-resources closes the client (and frees its memory) on every path.
        try (WebClient client = new WebClient()) {
            // Request and parse the page behind the URL.
            HtmlPage page = client.getPage("http://blog.youkuaiyun.com/u010427935");
            logger.info("===========>获取请求页面的html:{}", page.asXml());
            logger.info("=============>获取请求页面内容:{}", page.asText());
        } catch (Exception e) {
            // The trailing throwable makes SLF4J print the stack trace as well.
            logger.error("=====HtmlUnitTestDemo===test1===>{}", e.getMessage(), e);
        }
    }

    /**
     * Simulates a specific browser, which works around sites that refuse
     * requests from the default client.
     */
    @Test
    public void test2() {
        try (WebClient client = new WebClient(BrowserVersion.FIREFOX_52)) {
            HtmlPage page = client.getPage("http://mvnrepository.com");
            logger.debug("=====================>获取请求页面的html内容:{}", page.asXml());
        } catch (Exception e) {
            logger.error("=======test2==========>{}", e.getMessage(), e);
        }
    }

    /**
     * Looks up specific elements: one by id, then all elements with a given tag name.
     */
    @Test
    public void test3() {
        try (WebClient client = new WebClient(BrowserVersion.FIREFOX_52)) {
            HtmlPage page = client.getPage("http://blog.youkuaiyun.com/u010427935");

            // Fetch the element with the given id.
            HtmlDivision division = page.getHtmlElementById("navMenu");
            logger.debug("================>指定html的内容:{}", division.asXml());

            // Fetch every element with the given tag name (here: all anchors).
            DomNodeList<DomElement> anchors = page.getElementsByTagName("a");
            for (DomElement anchor : anchors) {
                logger.debug("===========>byTagName:{}", anchor.asXml());
            }
        } catch (Exception e) {
            logger.error("=====test3===========>{}", e.getMessage(), e);
        }
    }

    /**
     * Simulates a click on a form's submit button to drive the site's search feature.
     */
    @Test
    public void test4() {
        try (WebClient client = new WebClient(BrowserVersion.FIREFOX_52)) {
            // Load the page that contains the search form.
            HtmlPage page = client.getPage("http://blog.java1234.com/index.html");
            // Locate the form, its text box and its submit button.
            HtmlForm form = page.getFormByName("myform");
            HtmlTextInput input = form.getInputByName("q");
            HtmlSubmitInput submitInput = form.getInputByName("submitButton");

            for (int i = 0; i < SEARCH_REQUEST_COUNT; i++) {
                // Fill in the search term.
                input.setValueAttribute("java" + i);
                // Simulate the click, which submits the form and yields the result page.
                HtmlPage result = submitInput.click();
                logger.info("===========>搜索的结果:{}", result.asXml());
            }
        } catch (Exception e) {
            logger.error("===========test4========>{}", e.getMessage(), e);
        }
    }

    /**
     * Routes the request through a proxy host and port.
     */
    @Test
    public void test5() {
        try (WebClient client = new WebClient(BrowserVersion.FIREFOX_52, "58.118.185.100", 8998)) {
            HtmlPage page = client.getPage("http://blog.youkuaiyun.com/u010427935");
            logger.debug("=======html内容:====>{}", page.asXml());
        } catch (Exception e) {
            logger.error("===========test5=========>{}", e.getMessage(), e);
        }
    }

    /**
     * Disables JavaScript and CSS processing for pages that do not need
     * script execution to render their content.
     */
    @Test
    public void test6() {
        try (WebClient client = new WebClient(BrowserVersion.FIREFOX_52)) {
            // Turn off CSS and JavaScript handling in the client.
            client.getOptions().setCssEnabled(false);
            client.getOptions().setJavaScriptEnabled(false);

            HtmlPage page = client.getPage("http://blog.youkuaiyun.com/u010427935");
            logger.debug("==============>获取的html内容:{}", page.asXml());
        } catch (Exception e) {
            logger.error("========error========test6====>{}", e.getMessage(), e);
        }
    }

    /**
     * Scrapes a page whose content is loaded via AJAX, which a plain HTTP
     * client cannot capture because it does not execute scripts.
     */
    @Test
    public void test7() {
        try (WebClient client = new WebClient(BrowserVersion.FIREFOX_52)) {
            HtmlPage page = client.getPage("https://pan.baidu.com/share/home?uk=305605848#category/type=0");
            // Wait for background JavaScript instead of sleeping blindly: this
            // returns as soon as the pending jobs finish, or at the timeout.
            client.waitForBackgroundJavaScript(JS_WAIT_TIMEOUT_MILLIS);
            logger.info("==========>抓去到的html内容:{}", page.asXml());
        } catch (Exception e) {
            logger.error("=====error===test7=====>{}", e.getMessage(), e);
        }
    }
}