java htmlunit爬虫javascript爬坑-今日头条搜索操作

HtmlUnit-JavaScript爬坑

源码地址:https://download.youkuaiyun.com/download/NaXieNianYiShiGuJi/15729089

maven设置

    直接贴代码

<!-- Maven build descriptor for the HtmlUnit crawler demo: declares the project coordinates,
     UTF-8 / Java 1.8 build settings, and the HtmlUnit, jsoup and commons-* dependencies. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <!-- Project coordinates. -->
  <groupId>com.test.htmlunit</groupId>
  <artifactId>htm-unit</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  
  <!-- Use UTF-8 for sources, reports and the compiler so the Chinese string literals in the code survive the build. -->
  <properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
		<maven.compiler.encoding>UTF-8</maven.compiler.encoding>
	</properties>

	<build>
		<!-- NOTE(review): the artifact file name "http-utils" differs from the artifactId "htm-unit"; confirm this is intended. -->
		<finalName>http-utils</finalName>
		<plugins>
			<!-- Javadoc generation, UTF-8 encoded. -->
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-javadoc-plugin</artifactId>
				<version>2.10.3</version>
				<configuration>
					<encoding>UTF-8</encoding>
				</configuration>
			</plugin>
			<!-- Resource copying, UTF-8 encoded. -->
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-resources-plugin</artifactId>
				<version>2.7</version>
				<configuration>
					<encoding>UTF-8</encoding>
				</configuration>
			</plugin>
			<!-- Compile for Java 1.8 (source and target), UTF-8 encoded. -->
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-compiler-plugin</artifactId>
				<version>3.5.1</version>
				<configuration>
					<source>1.8</source>
					<target>1.8</target>
					<encoding>UTF-8</encoding>
				</configuration>
			</plugin>
		</plugins>
	</build>

	<dependencies>
		<!-- NOTE(review): no scope is declared for junit, so it is not limited to the test classpath; confirm whether test scope was intended. -->
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.12</version>
		</dependency>

		<!-- HtmlUnit headless browser (WebClient, HtmlPage, JavaScript engine) used by the crawler code. -->
		<dependency>
			<groupId>net.sourceforge.htmlunit</groupId>
			<artifactId>htmlunit</artifactId>
			<version>2.47.1</version>
			<!-- <version>2.29</version> -->
		</dependency>

		<!-- jsoup: used by the crawler code to parse page XML and run CSS-style selects. -->
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.8.3</version>
		</dependency>
		<dependency>
			<groupId>dom4j</groupId>
			<artifactId>dom4j</artifactId>
			<version>1.6.1</version>
		</dependency>
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-lang3</artifactId>
			<version>3.4</version>
		</dependency>
		<dependency>
			<groupId>commons-codec</groupId>
			<artifactId>commons-codec</artifactId>
			<version>1.9</version>
		</dependency>
		<!-- commons-beanutils and commons-logging are excluded here and declared explicitly below (1.9.2 and 1.2). -->
		<dependency>
			<groupId>commons-digester</groupId>
			<artifactId>commons-digester</artifactId>
			<version>2.0</version>
			<exclusions>
				<exclusion>
					<artifactId>commons-beanutils</artifactId>
					<groupId>commons-beanutils</groupId>
				</exclusion>
				<exclusion>
					<artifactId>commons-logging</artifactId>
					<groupId>commons-logging</groupId>
				</exclusion>
			</exclusions>
		</dependency>
		<dependency>
			<groupId>commons-beanutils</groupId>
			<artifactId>commons-beanutils</artifactId>
			<version>1.9.2</version>
			<exclusions>
				<exclusion>
					<artifactId>commons-logging</artifactId>
					<groupId>commons-logging</groupId>
				</exclusion>
			</exclusions>
		</dependency>
		<!-- commons-io is excluded here and declared explicitly below (2.5). -->
		<dependency>
			<groupId>commons-fileupload</groupId>
			<artifactId>commons-fileupload</artifactId>
			<version>1.3.1</version>
			<exclusions>
				<exclusion>
					<artifactId>commons-io</artifactId>
					<groupId>commons-io</groupId>
				</exclusion>
			</exclusions>
		</dependency>
		<dependency>
			<groupId>commons-io</groupId>
			<artifactId>commons-io</artifactId>
			<version>2.5</version>
		</dependency>
		<dependency>
			<groupId>commons-logging</groupId>
			<artifactId>commons-logging</artifactId>
			<version>1.2</version>
		</dependency>
		<!-- commons-codec and commons-logging are excluded here; the versions declared above (1.9 and 1.2) are used instead. -->
		<dependency>
			<groupId>commons-httpclient</groupId>
			<artifactId>commons-httpclient</artifactId>
			<version>3.1</version>
			<exclusions>
				<exclusion>
					<artifactId>commons-codec</artifactId>
					<groupId>commons-codec</groupId>
				</exclusion>
				<exclusion>
					<artifactId>commons-logging</artifactId>
					<groupId>commons-logging</groupId>
				</exclusion>
			</exclusions>
		</dependency>
	</dependencies>
</project>

常规方法介绍

       直接贴代码,注释很详细,上手很轻松

/**
 * Minimal HtmlUnit walkthrough: opens the Baidu home page, types a keyword into the
 * search box, clicks the search button and prints the resulting page as XML.
 *
 * @param args unused
 * @throws Exception declared for compatibility; failures inside the crawl are caught
 *                   and printed rather than propagated
 */
public static void main(String[] args) throws Exception {
	// WebClient is AutoCloseable, so try-with-resources replaces the manual finally/close().
	try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) { // emulate Chrome
		webClient.getOptions().setThrowExceptionOnScriptError(false); // do not throw when page JavaScript fails
		webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); // do not throw on non-200 HTTP status
		webClient.getOptions().setActiveXNative(false);
		webClient.getOptions().setCssEnabled(false); // the page is never rendered, so CSS is not needed
		webClient.getOptions().setJavaScriptEnabled(true); // important: enable JavaScript
		webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // important: AJAX support

		// Load the page.
		HtmlPage page = webClient.getPage("https://www.baidu.com");
		HtmlInput username = (HtmlInput) page.getElementById("kw");
		username.type("李晟");
		HtmlInput su = (HtmlInput) page.getElementById("su");
		// Click the submit input.
		HtmlPage nextPage = su.click();
		// Asynchronous JavaScript takes time: wait up to 10000 ms (10 seconds) for background jobs.
		webClient.waitForBackgroundJavaScript(10000);
		System.out.println(nextPage.asXml());
	} catch (Exception e) {
		e.printStackTrace();
	}
}

     调用页面JavaScript方法,贴代码

                // Call a simple JavaScript function defined by the loaded page.
                // The code string is evaluated in the context of nextPage, so the function
                // named inside it must exist in that page's scripts.
	        String javaScriptCode = "funtion1('235','234')";
	        ScriptResult result = nextPage.executeJavaScript(javaScriptCode); 
	        //webClient.waitForBackgroundJavaScript(3000);// optional: asynchronous JS takes time; this would wait up to 3000 ms (3 seconds) for it to finish
	        // If the function triggers AJAX, the page has to be fetched again from the script result.
	        HtmlPage nextPage2=(HtmlPage)result.getNewPage();
	        System.out.println(nextPage2.asXml());

    是不是感觉很简单,当然调用JavaScript方法需要对页面源码有一定阅读能力

爬坑

    好了,简单入手就到这里了,下面来个挑战

    需求如下,打开头条搜索,搜索关键词中国新闻网,点击用户,点击中国新闻网

             

    如图所示,我们把需求分解成以下步骤

    1、我们可以直接访问头条搜索地址https://www.toutiao.com/search/?keyword=中国新闻网

    2、点击用户

    3、点击中国新闻网

    直接贴代码

        // First attempt at the Toutiao search: load the search page, click the third tab
        // of the tab bar, then parse the resulting page with jsoup and print the tab bar HTML.
        try {
		// Toutiao is served over https.
		// NOTE(review): setUseInsecureSSL(true) is understood to relax certificate checking - confirm against the HtmlUnit docs before using outside a demo.
		webClient.getOptions().setUseInsecureSSL(true);
		// Suppress exceptions for JavaScript errors raised by the page.
                webClient.getOptions().setThrowExceptionOnScriptError(false);
		JavaScriptEngine engine = (JavaScriptEngine)webClient.getJavaScriptEngine();
		// NOTE(review): presumably defers the engine's postponed actions - confirm the intent; nothing here resumes them explicitly.
		engine.holdPosponedActions();
	        HtmlPage page = webClient.getPage("https://www.toutiao.com/search/?keyword=中国新闻网");  
	        webClient.waitForBackgroundJavaScript(10000);// asynchronous JS takes time: wait up to 10000 ms (10 seconds) for it to finish
	        // Quick lookup in <div class="tabBar"><ul><li>综合</li><li>视频</li><li>用户</li></ul></div>: li[3] is the third tab.
	        List<HtmlListItem> tabli=page.getByXPath("//div[@class='tabBar']/ul/li[3]");
	        HtmlPage page2=tabli.get(0).click();
	        webClient.waitForBackgroundJavaScript(10000);// asynchronous JS takes time: wait up to 10000 ms (10 seconds) for it to finish
	        //System.out.println(page2.asXml());
	        // Quick DOM parsing with jsoup.
	        Document doc = Jsoup.parse(page2.asXml());
	        Element element = doc.select("div[class=tabBar]").first();
	        System.out.println(":::::::::::::::::::::"+element.html());
	} catch (Exception e) {
		e.printStackTrace();
	} finally {
		webClient.close();
	}

    代码中有段doc.select("div[class=tabBar]"),附下jsoup的select说明

 

    跑个代码看看,满怀期待

    本来也没指望一次搞定,果然拿不到搜索的内容

    先看看浏览器的网络请求

     再在控制台找找,看有没有走这个ajax请求

    控制台有这个ajax请求的链接,那为啥我们没有拿到信息呢,我们再看看控制台

    原来头条的js经过混淆压缩了,导致htmlunit无法加载内容

    想想其他办法,既然ajax请求链接在控制台有打印,那我们想办法拿到ajax请求链接不就好了吗

    看看源码

   

      貌似没有开放相关接口,不过没关系,Java 嘛,既然有这个回调方法,那就继承并重写它,把 ajax 请求对象保存下来,就没有拿不到的东西

package com.lz.test;

import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Ajax controller that records the most recent AJAX request seen by HtmlUnit so the
 * caller can inspect it (for example its URL) after the page scripts have run.
 *
 * <p>Apart from recording the request, it delegates the sync/async decision to
 * {@link NicelyResynchronizingAjaxController}.
 */
public class MyNicelyResynchronizingAjaxController extends NicelyResynchronizingAjaxController {
	/**
	 * The request passed to the latest {@link #processSynchron} call, or {@code null}
	 * if no AJAX request has been processed yet. Kept public for backward
	 * compatibility; prefer {@link #getAjaxSetting()}.
	 */
	public WebRequest ajaxSetting;

	/**
	 * @return the most recently recorded AJAX request, or {@code null} if none was seen
	 */
	public WebRequest getAjaxSetting() {
		return ajaxSetting;
	}

	/**
	 * @param ajaxSetting the request to store as the most recent AJAX request
	 */
	public void setAjaxSetting(WebRequest ajaxSetting) {
		this.ajaxSetting = ajaxSetting;
	}

	/**
	 * Records the request, then lets the parent controller decide whether it should be
	 * executed synchronously.
	 *
	 * @param page     the page the request comes from
	 * @param settings the AJAX request about to be performed
	 * @param async    whether the script asked for an asynchronous request
	 * @return the parent's decision: {@code true} to run the request synchronously
	 */
	@Override
	public boolean processSynchron(final HtmlPage page, final WebRequest settings, final boolean async) {
		ajaxSetting = settings;
		// Return the parent's decision. The previous version called super but discarded its
		// result and always returned !async, which bypassed the parent's re-synchronization.
		return super.processSynchron(page, settings, async);
	}
}

    把webClient.setAjaxController(new NicelyResynchronizingAjaxController());换成我的,顺便对页面加一下错误监听,直接贴代码

/**
 * Toutiao search walkthrough: loads the search result page for a keyword, clicks the
 * third tab of the tab bar, prints the "sections" block and the last AJAX request URL
 * captured by {@code MyNicelyResynchronizingAjaxController}, then clicks the first
 * result link and prints the page it leads to.
 *
 * @param args unused
 * @throws Exception declared for compatibility; failures inside the crawl are caught
 *                   and printed rather than propagated
 */
public static void main(String[] args) throws Exception {
	// Custom controller that remembers the last AJAX request so its URL can be read back later.
	MyNicelyResynchronizingAjaxController myajax = new MyNicelyResynchronizingAjaxController();

	// WebClient is AutoCloseable, so try-with-resources replaces the manual finally/close().
	try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) { // emulate Chrome
		webClient.getOptions().setThrowExceptionOnScriptError(false); // do not throw when page JavaScript fails
		webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); // do not throw on non-200 HTTP status
		webClient.getOptions().setActiveXNative(false);
		webClient.getOptions().setCssEnabled(false); // the page is never rendered, so CSS is not needed
		webClient.getOptions().setJavaScriptEnabled(true); // important: enable JavaScript
		webClient.setAjaxController(myajax); // important: AJAX support, with request capture
		// Toutiao is served over https.
		webClient.getOptions().setUseInsecureSSL(true);

		// Report JavaScript problems as short console lines instead of stack traces.
		webClient.setJavaScriptErrorListener(new JavaScriptErrorListener() {

			@Override
			public void warn(String message, String sourceName, int line, String lineSource, int lineOffset) {
				System.out.println("warn");
			}

			@Override
			public void timeoutError(HtmlPage page, long allowedTime, long executionTime) {
				System.out.println("timeoutError");
			}

			@Override
			public void scriptException(HtmlPage page, ScriptException scriptException) {
				System.out.println("scriptException"+page.getUrl());
			}

			@Override
			public void malformedScriptURL(HtmlPage page, String url, MalformedURLException malformedURLException) {
				System.out.println("malformedScriptURL"+url);
			}

			@Override
			public void loadScriptError(HtmlPage page, URL scriptUrl, Exception exception) {
				System.out.println("loadScriptError"+scriptUrl);
			}
		});

		JavaScriptEngine engine = (JavaScriptEngine) webClient.getJavaScriptEngine();
		engine.holdPosponedActions();

		HtmlPage page = webClient.getPage("https://www.toutiao.com/search/?keyword=中国新闻网");
		// Asynchronous JavaScript takes time: wait up to 10000 ms (10 seconds) for background jobs.
		webClient.waitForBackgroundJavaScript(10000);

		// Quick lookup in <div class="tabBar"><ul><li>综合</li><li>视频</li><li>用户</li></ul></div>: li[3] is the third tab.
		List<HtmlListItem> tabli = page.getByXPath("//div[@class='tabBar']/ul/li[3]");
		if (tabli.isEmpty()) {
			// Without the tab there is nothing to click; stop with a readable message.
			System.err.println("tabBar tab li[3] not found on " + page.getUrl());
			return;
		}
		HtmlPage page2 = tabli.get(0).click();
		webClient.waitForBackgroundJavaScript(10000); // wait up to 10 seconds for the tab content to load
		System.out.println(page2.asXml());

		// Quick DOM parsing with jsoup.
		Document doc = Jsoup.parse(page2.asXml());
		Element element = doc.select("div[class=sections]").first();
		if (element == null) {
			System.err.println("div.sections not found in the page after clicking the tab");
		} else {
			System.out.println(":::::::::::::::::::::"+element.html());
		}

		// URL of the last AJAX request seen by the custom controller (null if none was issued).
		WebRequest ajaxSetting = myajax.getAjaxSetting();
		if (ajaxSetting == null) {
			System.err.println("no ajax request was captured");
		} else {
			System.out.println("ajax=============="+ajaxSetting.getUrl());
		}

		List<HtmlAnchor> div_a = page2.getByXPath("//div[@class='sections']/div[1]/a");
		if (div_a.isEmpty()) {
			System.err.println("no result link found under div.sections");
			return;
		}
		HtmlPage page3 = div_a.get(0).click();
		webClient.waitForBackgroundJavaScript(10000); // wait up to 10 seconds for the target page scripts
		System.out.println(page3.asXml());
	} catch (Exception e) {
		e.printStackTrace();
	}
}

    代码跑起来,看看控制台,结果如下,至此问题解决。

源码地址:https://download.youkuaiyun.com/download/NaXieNianYiShiGuJi/15729089

源码地址:https://download.youkuaiyun.com/download/NaXieNianYiShiGuJi/15729089

源码地址:https://download.youkuaiyun.com/download/NaXieNianYiShiGuJi/15729089

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值