抓取 开发者头条 分享的所有文章

本文介绍了一种使用Java中的HttpClient和jsoup库来抓取开发者头条网站上所有文章的方法。通过该方法可以批量获取从2014年9月27日至当前日期的所有文章链接和相关信息,并将数据保存为CSV文件。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

使用 HttpClient 和 jsoup 抓取[url=http://toutiao.io/]开发者头条[/url]中分享的所有文章(截至目前15000多条)。

数据:[url=http://dl.iteye.com/topics/download/9e452419-0219-397d-96a9-fd242c3cd989]点击下载[/url]
代码:[url=http://dl.iteye.com/topics/download/9557f77b-14aa-3866-9ede-084ca514c3d1]点击下载[/url]

[img]http://dl2.iteye.com/upload/attachment/0118/9834/d426a3fd-2c16-3020-b9b5-09da98f85272.png[/img]
[img]http://dl2.iteye.com/upload/attachment/0118/9830/15755518-af73-353c-b0d4-6b5c3a6a2911.png[/img]


/**
 * Crawls the daily archive pages of toutiao.io and appends one CSV row per
 * shared article to a local file.
 *
 * <p>For every day from 2014-09-27 up to today the page
 * {@code http://toutiao.io/prev/<yyyy-MM-dd>} is downloaded, every anchor with
 * {@code target="_blank"} is collected, and each anchor's href is requested
 * once with redirects disabled so that the {@code Location} header of the
 * redirect response can be recorded as the article's origin link.
 *
 * <p>Row layout: {@code date,index,title,originLink,link}.
 */
public class ToutiaoArticles {

    /** Maximum pooled connections (total and per route) for each HTTP client. */
    private static final int MAX_CONNECTIONS = 5;

    public static void main(String[] args) {
        new ToutiaoArticles().fetch();
    }

    /**
     * Walks the archive day by day and appends the articles found on each day
     * to the output file.
     *
     * <p>A failure on one day is printed and the crawl continues with the next
     * day. Both connection pools are shut down when the crawl ends, whether it
     * finishes normally or not.
     */
    public void fetch() {
        LocalDate firstDay = new LocalDate(2014, 9, 27);
        LocalDate lastDay = LocalDate.now();
        File outputFile = new File("D://data.csv");
        String baseUrl = "http://toutiao.io/prev/";

        PoolingHttpClientConnectionManager mgr = newConnectionManager();
        HttpClient httpClient = HttpClientBuilder.create().setConnectionManager(mgr).build();
        // One handler (and therefore one redirect-probing client) for the whole crawl.
        PageResponseHandler pageHandler = new PageResponseHandler();

        try {
            for (LocalDate day = firstDay; !day.isAfter(lastDay); day = day.plusDays(1)) {
                String date = day.toString("yyyy-MM-dd");
                String url = baseUrl + date;
                System.out.println("[URL]-----" + url);

                HttpGet httpGet = new HttpGet(url);
                try {
                    List<Link> linkInfos = httpClient.execute(httpGet, pageHandler);
                    if (linkInfos != null && !linkInfos.isEmpty()) {
                        StringBuilder articleInfos = new StringBuilder();
                        for (int i = 0; i < linkInfos.size(); i++) {
                            Link k = linkInfos.get(i);
                            String data = date + "," + (i + 1)
                                    + "," + csvField(k.getTitle())
                                    + "," + csvField(k.getOriginLink())
                                    + "," + csvField(k.getLink());
                            System.out.println(data);
                            articleInfos.append(data).append("\r\n");
                        }
                        // Last argument 'true' appends to the file instead of overwriting it.
                        FileUtils.writeStringToFile(outputFile, articleInfos.toString(), "GBK", true);
                    }
                } catch (Exception e) {
                    // Best effort: report the failed day and move on to the next one.
                    e.printStackTrace();
                } finally {
                    httpGet.releaseConnection();
                }
            }
        } finally {
            pageHandler.shutdown();
            mgr.shutdown();
        }
    }

    /** Creates a connection pool limited to {@link #MAX_CONNECTIONS} connections. */
    private static PoolingHttpClientConnectionManager newConnectionManager() {
        PoolingHttpClientConnectionManager mgr = new PoolingHttpClientConnectionManager();
        mgr.setMaxTotal(MAX_CONNECTIONS);
        mgr.setDefaultMaxPerRoute(MAX_CONNECTIONS);
        return mgr;
    }

    /**
     * Renders a value as a single CSV field.
     *
     * <p>{@code null} becomes an empty field. A value containing a comma, a
     * double quote or a line break is wrapped in double quotes with embedded
     * quotes doubled; any other value is returned unchanged.
     *
     * @param value raw field value, may be {@code null}
     * @return the value in a form that cannot break the row's column layout
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

    /**
     * Turns the HTML of one archive page into a list of {@link Link}s and
     * resolves each link's redirect target.
     *
     * <p>The handler owns a second HTTP client with redirects disabled; call
     * {@link #shutdown()} once the handler is no longer needed to release its
     * connection pool.
     */
    class PageResponseHandler implements ResponseHandler<List<Link>> {

        private final PoolingHttpClientConnectionManager redirectMgr;
        private final HttpClient redirectClient;

        PageResponseHandler() {
            // Redirects stay disabled so the redirect response itself (and its
            // Location header) is visible instead of being followed.
            RequestConfig requestConfig = RequestConfig.custom().setRedirectsEnabled(false).build();
            redirectMgr = newConnectionManager();
            redirectClient = HttpClientBuilder.create()
                    .setDefaultRequestConfig(requestConfig)
                    .setConnectionManager(redirectMgr)
                    .build();
        }

        /** Releases the connection pool used for resolving redirect targets. */
        void shutdown() {
            redirectMgr.shutdown();
        }

        /**
         * @param response response of the archive page request
         * @return the links found on the page, or {@code null} when the status
         *         code is 300 or above or the response carries no entity
         */
        @Override
        public List<Link> handleResponse(HttpResponse response) throws ClientProtocolException, IOException {
            HttpEntity entity = response.getEntity();

            if (response.getStatusLine().getStatusCode() >= 300) {
                EntityUtils.consume(entity);
                return null;
            }
            if (entity == null) {
                return null;
            }

            // UTF-8 is only the fallback, used when the entity declares no charset.
            String html = EntityUtils.toString(entity, "UTF-8");
            Document document = Jsoup.parse(html);
            Elements links = document.getElementsByAttributeValue("target", "_blank");

            List<Link> linkInfos = new ArrayList<Link>();
            for (int i = 0; i < links.size(); i++) {
                Link lk = new Link();
                lk.setLink(links.get(i).attr("href"));
                lk.setTitle(links.get(i).text());
                resolveOriginLink(lk);
                linkInfos.add(lk);
            }
            return linkInfos;
        }

        /**
         * Requests the link once without following redirects and, when the
         * answer is a 3xx response with a {@code Location} header, stores that
         * location (minus the toutiao.io tracking parameters) as the origin link.
         *
         * <p>Any failure is printed and leaves the origin link unset, so one
         * bad href never discards the rest of the page.
         */
        private void resolveOriginLink(Link lk) {
            String href = lk.getLink();
            if (href == null || href.isEmpty()) {
                return;
            }

            HttpGet httpGet = null;
            try {
                // Constructed inside the try: a malformed href throws here.
                httpGet = new HttpGet(href);
                HttpResponse httpResponse = redirectClient.execute(httpGet);
                int status = httpResponse.getStatusLine().getStatusCode();
                if (status >= 300 && status < 400 && httpResponse.containsHeader("Location")) {
                    String loc = httpResponse.getLastHeader("Location").getValue();
                    // Literal replacement; the '.' must not act as a regex wildcard.
                    loc = loc.replace("hmsr=toutiao.io", "");
                    loc = loc.replace("&utm_medium=toutiao.io", "");
                    loc = loc.replace("&utm_source=toutiao.io", "");
                    lk.setOriginLink(loc);
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (httpGet != null) {
                    httpGet.releaseConnection();
                }
            }
        }

    }

    /** One shared article: its title, its href on the archive page and its redirect target. */
    class Link {
        /** Anchor text taken from the archive page. */
        private String title;
        /** Value of the anchor's href attribute. */
        private String link;
        /** Redirect target of {@link #link}; {@code null} when it could not be resolved. */
        private String originLink;

        public String getTitle() {
            return title;
        }
        public void setTitle(String title) {
            this.title = title;
        }
        public String getLink() {
            return link;
        }
        public void setLink(String link) {
            this.link = link;
        }
        public String getOriginLink() {
            return originLink;
        }
        public void setOriginLink(String originLink) {
            this.originLink = originLink;
        }
    }

}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值