前言
在 XXapp 中输入关键字进行搜索,当请求过于频繁时会出现验证码页面;
此时更换一个新的 cookie 再请求即可解决。

代码
// String url = "https:/xxx?type=2&s_from8&_A";
String url = this.engineSite.getStartUrl();
ArrayList<Article> list = new ArrayList<Article>();
try {
// 替换关键字
url = url.replaceAll("ABC", URLEncoder.encode(keyword, "utf-8"));
// 替换页数
url = url.replace("CBA", pageorder + "");
// jsoup获取doc对象
Document doc = Jsoup.parse(new URL(url), 10000);
// 必要时,重启cookie
if (doc.getElementsByTag("ul").isEmpty()) {
// 通过HttpClient得到Cookies
HttpClient http = null;
CookieStore httpCookieStore = new BasicCookieStore();
http = HttpClientBuilder.create().setDefaultCookieStore(httpCookieStore).build();
HttpGet httpRequest = new HttpGet("https://XXX.com/v?ie=utf8&query=&p=40030600");
@SuppressWarnings("unused")
HttpResponse httpResponse = null;
try {
httpResponse = http.execute(httpRequest);
} catch (Exception e) {
logger.error("HttpClient Connect Error :" + e);
}
List<Cookie> cookies = httpCookieStore.getCookies();
// 设置cookie并发送请求
HashMap<String, String> cookiesMap = new HashMap<String, String>();
for (Cookie c : cookies) {
cookiesMap.put(c.getName(), c.getValue());
}
doc = Jsoup.connect(url).cookies(cookiesMap).get();
}
// 根据li标签获取具体内容
Elements Lis = doc.getElementsByTag("ul").get(1).getElementsByTag("li");
// 循环生成文章对象
for (int i = 0; i < Lis.size(); i++) {
Article article = new Article();
Element e = Lis.get(i);
// 设置文章网站
IWebsite web = new IWebsite();
web.setWebUrl("https://xxxx.com/");
web.setWebName("xxxn");
article.setWebsite(web);
// 设置url、author、content等
article.setUrl(web.getWebUrl() + e.getElementsByAttribute("href").first().attr("href"));
article.setAuthor(e.getElementById("XXX_11002601_account_" + i).text());
article.setAuthorUrl(web.getWebUrl() + e.getElementById("zzz_vr_11002601_account_" + i).attr("href"));
// 设置content为摘要
article.setContent(article.getDigest());
// 设置发布时间
String dataTime = getWeixinTime(e.selectFirst("div.s-p").attr("t").toString());
article.setPostTime(dataTime);
// 设置标题
article.setEngineName("XX");
article.setSubject(doc.getElementsByTag("h3").get(i).text());
// 设置唯一标识
article.setUnicodeString(article.getUrl());
list.add(article);
}
} catch (
Exception e) {
logger.error("Parse page Error :" + e);
}
return list;
上文代码的关键部分是:当返回的页面中没有 ul 元素(即出现验证码页面)时,先通过 HttpClient 重新获取一组新的 cookie,再带上这些 cookie 重新发起请求。
效果


连续爬取上百行数据,均未再出现验证码。
最后
解决方案来自 https://blog.youkuaiyun.com/weixin_41186451/article/details/96980556
本帖仅供学习交流使用,禁止商用行为
1772

被折叠的 条评论
为什么被折叠?



