package com.chongdong.log.test;import java.io.IOException;import java.util.HashMap;import
java.util.Map;import org.jsoup.Connection;import org.jsoup.Connection.Method;import
org.jsoup.Jsoup;import org.jsoup.helper.HttpConnection.Response;import org.jsoup.nodes.Document;import
org.jsoup.select.Elements;import org.junit.Test;/** * * 类名称:JsoupTest * 类描述: jsoup 抓取 mitsuku聊天 信息* 创建人:zk * 创建时间:2015-7-20 下午3:52:06 * 修改人:zk * 修改时间:2015-7-20
下午3:52:06 * 修改备注: * 开发进度:* @version 1.0 * */public
class
JsoupTest {
public static
void postMitSuKu(){ Map<String, String> map =
new HashMap<String, String>();
/** 表单 提交的参数 * input:how old are you botid:9fa364f2fe345a10 custid:c04f62ad1e044059 faq : http://www.pandorabots.com/botmaster/en/faq#h1 * **/ map.put("input",
"hi"); map.put("botid",
"9fa364f2fe345a10");
//开启 可在 跳转到聊天框界面 获取 标示 map.put("custid",
"c04f62ad1e044059");
//当一个空请求 即可获取 相当于 标示 Connection conn = Jsoup.connect("http://fiddle.pandorabots.com/pandora/talk-xml");
/* conn.header("(Request-Line)", "POST /cgi-bin/login?lang=zh_CN HTTP/1.1");*/ conn.header("Accept",
"*/*"); conn.header("Accept-Encoding",
"gzip,deflate,sdch"); conn.header("Accept-Language",
"zh-CN,zh;q=0.8");/* conn.header("Content-Length", "58");*/ conn.header("Origin",
"http://www.square-bear.co.uk"); conn.header("Pragma",
"no-cache"); conn.header("Connection",
"Keep-Alive");
//必须 填写 表单提交 conn.header("Content-Type",
"application/x-www-form-urlencoded"); conn.header("Host",
"fiddle.pandorabots.com"); conn.header("Referer",
"http://www.square-bear.co.uk/mitsuku/mitsy_retro.swf"); conn.header("User-Agent",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36");
try { Response response = (Response) conn.ignoreContentType(true).method(Method.POST).data(map).execute();
//response. String json=response.body(); System.out.println(json); }
catch (IOException e) {
// TODO Auto-generated catch block e.printStackTrace(); } }
/////////////////////////////////////////////////////////////////////////////////////////
//方案2: 通过 开发抓包工具可知 表单的提交方式 应该为http post 此处为 get 方法 不合适
/** * 请求英文对话的网页,抓取结果 * @param url * @return */
private static String
processLogic(String url){ String result =
""; try { Document document = Jsoup.connect(url).ignoreContentType(true).ignoreHttpErrors(true)
.followRedirects(true).timeout(5000).userAgent("Mozilla/5.0 (compatible; MSIE 9.0; Windows
NT 6.1; Trident/4.0;)").get(); Elements elements = document.select("result that"); result = elements.text(); System.out.println(result); }
catch (Exception e) { e.printStackTrace(); result =
"ok"; } return result; }
public static
void main(String[] args) {
for (int i =
0; i < 100; i++) {
// Thread thread=new Thread();
//thread.start(); postMitSuKu(); }
/* 通过 开发抓包工具可知 表单的提交方式 应该为http post 此处为 get 方法 不合适 String url = "http://fiddle.pandorabots.com/pandora/talk-xml?input=%s&botid=9fa364f2fe345a10&custid=bbbb30debe1bc7f7"; processLogic(url); */
}}
// Using the jsoup crawler to scrape a page
// (Latest recommended article published on 2023-07-17 16:58:52)
