使用 HttpClient 和 jsoup 抓取[url=http://toutiao.io/]开发者头条[/url]中分享的所有文章(截至目前共 15000 多条)。
数据:[url=http://dl.iteye.com/topics/download/9e452419-0219-397d-96a9-fd242c3cd989]点击下载[/url]
代码:[url=http://dl.iteye.com/topics/download/9557f77b-14aa-3866-9ede-084ca514c3d1]点击下载[/url]
[img]http://dl2.iteye.com/upload/attachment/0118/9834/d426a3fd-2c16-3020-b9b5-09da98f85272.png[/img]
[img]http://dl2.iteye.com/upload/attachment/0118/9830/15755518-af73-353c-b0d4-6b5c3a6a2911.png[/img]
数据:[url=http://dl.iteye.com/topics/download/9e452419-0219-397d-96a9-fd242c3cd989]点击下载[/url]
代码:[url=http://dl.iteye.com/topics/download/9557f77b-14aa-3866-9ede-084ca514c3d1]点击下载[/url]
[img]http://dl2.iteye.com/upload/attachment/0118/9834/d426a3fd-2c16-3020-b9b5-09da98f85272.png[/img]
[img]http://dl2.iteye.com/upload/attachment/0118/9830/15755518-af73-353c-b0d4-6b5c3a6a2911.png[/img]
/**
 * Crawls the daily archive pages under {@code http://toutiao.io/prev/} and
 * appends one CSV row per shared article to {@code D://data.csv}.
 *
 * <p>Each row has the columns: date, 1-based position on the page, title,
 * origin link (the redirect target of the toutiao link, tracking parameters
 * removed) and the toutiao link itself.
 */
public class ToutiaoArticles {

    /** Prefix of the daily archive URL; the date ({@code yyyy-MM-dd}) is appended to it. */
    private static final String BASE_URL = "http://toutiao.io/prev/";

    /** Location of the CSV file that rows are appended to. */
    private static final String OUTPUT_PATH = "D://data.csv";

    /** Encoding used when writing the CSV file. */
    private static final String OUTPUT_ENCODING = "GBK";

    /** Query parameters that toutiao.io adds to outgoing links and that are stripped again. */
    private static final String[] TRACKING_PARAMS = {
        "hmsr=toutiao.io",
        "utm_medium=toutiao.io",
        "utm_source=toutiao.io"
    };

    public static void main(String[] args) {
        new ToutiaoArticles().fetch();
    }

    /**
     * Walks every day from 2014-09-27 up to and including today, downloads that
     * day's archive page and appends its articles to the CSV file.
     *
     * <p>The crawl is best-effort: a failure on one day is reported on
     * {@code System.err} and the loop continues with the next day.
     */
    public void fetch() {
        LocalDate day = new LocalDate(2014, 9, 27);
        LocalDate lastDay = LocalDate.now();
        File outputFile = new File(OUTPUT_PATH);

        PoolingHttpClientConnectionManager mgr = new PoolingHttpClientConnectionManager();
        mgr.setMaxTotal(5);
        mgr.setDefaultMaxPerRoute(5);
        HttpClient httpClient = HttpClientBuilder.create().setConnectionManager(mgr).build();

        try {
            while (day.isBefore(lastDay) || day.isEqual(lastDay)) {
                String date = day.toString("yyyy-MM-dd");
                String url = BASE_URL + date;
                System.out.println("[URL]-----" + url);

                HttpGet httpGet = new HttpGet(url);
                try {
                    List<Link> links = httpClient.execute(httpGet, new PageResponseHandler());
                    // The handler returns null for error statuses and empty responses.
                    if (links != null) {
                        StringBuilder rows = new StringBuilder();
                        for (int i = 0; i < links.size(); i++) {
                            Link link = links.get(i);
                            String row = date + "," + (i + 1)
                                    + "," + csvField(link.getTitle())
                                    + "," + csvField(link.getOriginLink())
                                    + "," + csvField(link.getLink());
                            System.out.println(row);
                            rows.append(row).append("\r\n");
                        }
                        FileUtils.writeStringToFile(outputFile, rows.toString(), OUTPUT_ENCODING, true);
                    }
                } catch (Exception e) {
                    // Deliberately broad: one bad day must not abort the whole crawl.
                    System.err.println("Failed to process " + url);
                    e.printStackTrace();
                } finally {
                    httpGet.releaseConnection();
                }
                day = day.plusDays(1);
            }
        } finally {
            // Release the pooled connections once the crawl is over.
            mgr.shutdown();
        }
    }

    /**
     * Quotes a value for use as a CSV field.
     *
     * @param value the raw field value, may be {@code null}
     * @return an empty string for {@code null}; the value wrapped in double
     *         quotes (with embedded quotes doubled) when it contains a comma,
     *         a double quote or a line break; otherwise the value unchanged
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

    /**
     * Removes the toutiao.io tracking parameters from the query string of a URL.
     *
     * <p>Only whole query parameters listed in {@link #TRACKING_PARAMS} are
     * dropped, so no dangling {@code ?} or {@code &} is left behind. The URL is
     * returned untouched when it carries none of them.
     *
     * @param location the redirect target, never {@code null}
     * @return the URL without the tracking parameters
     */
    private static String stripTrackingParams(String location) {
        int fragmentAt = location.indexOf('#');
        String fragment = fragmentAt < 0 ? "" : location.substring(fragmentAt);
        String beforeFragment = fragmentAt < 0 ? location : location.substring(0, fragmentAt);

        int queryAt = beforeFragment.indexOf('?');
        if (queryAt < 0) {
            return location;
        }

        // Limit -1 keeps empty segments so untouched parts of the query survive as they were.
        String[] params = beforeFragment.substring(queryAt + 1).split("&", -1);
        StringBuilder kept = new StringBuilder();
        boolean removedAny = false;
        boolean first = true;
        for (String param : params) {
            if (isTrackingParam(param)) {
                removedAny = true;
                continue;
            }
            if (!first) {
                kept.append('&');
            }
            kept.append(param);
            first = false;
        }
        if (!removedAny) {
            return location;
        }

        String base = beforeFragment.substring(0, queryAt);
        if (kept.length() == 0) {
            return base + fragment;
        }
        return base + "?" + kept + fragment;
    }

    /** Tells whether {@code param} is exactly one of the {@link #TRACKING_PARAMS}. */
    private static boolean isTrackingParam(String param) {
        for (String tracking : TRACKING_PARAMS) {
            if (tracking.equals(param)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Turns the HTML of one archive page into the list of links found on it.
     *
     * <p>Every anchor with {@code target="_blank"} becomes a {@link Link}. Each
     * link is requested once with redirects disabled; when the answer is a 3xx
     * response with a {@code Location} header, that target (minus tracking
     * parameters) is stored as the origin link.
     */
    class PageResponseHandler implements ResponseHandler<List<Link>> {

        /**
         * @param response the response for the archive page
         * @return the links on the page, or {@code null} when the status code
         *         is 300 or above or the response has no entity
         * @throws ClientProtocolException declared by {@link ResponseHandler}
         * @throws IOException if the page body cannot be read
         */
        @Override
        public List<Link> handleResponse(HttpResponse response) throws ClientProtocolException, IOException {
            HttpEntity entity = response.getEntity();
            if (response.getStatusLine().getStatusCode() >= 300) {
                EntityUtils.consume(entity);
                return null;
            }
            if (entity == null) {
                return null;
            }

            // UTF-8 is only the fallback used when the response declares no charset.
            String html = EntityUtils.toString(entity, "UTF-8");
            Document document = Jsoup.parse(html);
            Elements anchors = document.getElementsByAttributeValue("target", "_blank");

            // Redirects stay disabled so the Location header of each link can be read.
            RequestConfig requestConfig = RequestConfig.custom().setRedirectsEnabled(false).build();
            PoolingHttpClientConnectionManager mgr = new PoolingHttpClientConnectionManager();
            mgr.setMaxTotal(5);
            mgr.setDefaultMaxPerRoute(5);
            HttpClient probeClient = HttpClientBuilder.create()
                    .setDefaultRequestConfig(requestConfig)
                    .setConnectionManager(mgr)
                    .build();

            List<Link> linkInfos = new ArrayList<Link>();
            try {
                for (int i = 0; i < anchors.size(); i++) {
                    Link link = new Link();
                    link.setLink(anchors.get(i).attr("href"));
                    link.setTitle(anchors.get(i).text());
                    link.setOriginLink(resolveOriginLink(probeClient, link.getLink()));
                    linkInfos.add(link);
                }
            } finally {
                // This pool is created per page, so it must be released per page.
                mgr.shutdown();
            }
            return linkInfos;
        }

        /**
         * Requests {@code href} without following redirects and returns where it points to.
         *
         * @param client a client configured not to follow redirects
         * @param href   the link taken from the archive page
         * @return the cleaned redirect target, or {@code null} when the link does
         *         not redirect or the request fails
         */
        private String resolveOriginLink(HttpClient client, String href) {
            HttpGet probe = null;
            try {
                // Built inside the try: a malformed href must only cost this one link.
                probe = new HttpGet(href);
                HttpResponse probeResponse = client.execute(probe);
                int status = probeResponse.getStatusLine().getStatusCode();
                boolean redirect = status >= 300 && status < 400;
                if (redirect && probeResponse.getLastHeader("Location") != null) {
                    return stripTrackingParams(probeResponse.getLastHeader("Location").getValue());
                }
                return null;
            } catch (Exception e) {
                // Best-effort: the link is still reported, just without an origin link.
                System.err.println("Failed to resolve origin of " + href);
                e.printStackTrace();
                return null;
            } finally {
                if (probe != null) {
                    probe.releaseConnection();
                }
            }
        }
    }

    /** One article entry of an archive page. */
    class Link {
        /** Anchor text of the link. */
        private String title;
        /** The {@code href} as found on the archive page. */
        private String link;
        /** Redirect target of {@link #link}; {@code null} when it could not be resolved. */
        private String originLink;

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getLink() {
            return link;
        }

        public void setLink(String link) {
            this.link = link;
        }

        public String getOriginLink() {
            return originLink;
        }

        public void setOriginLink(String originLink) {
            this.originLink = originLink;
        }
    }
}