/**
 * Downloads the first page of the 36kr.com "recent topics" listing, extracts
 * one {@code News} entry (URL, title, tag) per {@code .infos} element and
 * prints the number of entries followed by the entries themselves to
 * standard output.
 *
 * <p>This is a best-effort crawl: any failure is reported on standard error
 * and the method returns normally instead of propagating the exception.
 */
private static void crawl36() {
    String urls = "http://www.36kr.com/topics/recent?page=1";
    String site = "http://www.36kr.com";
    HttpURLConnection con = null;
    java.io.InputStream in = null;
    try {
        con = getHttpURLConnection(urls);
        con.setRequestProperty(
                "User-Agent",
                "Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5376e Safari/8536.25");
        con.setRequestMethod("GET");

        // getContentType() may return null; the helper falls back to GBK,
        // which is also the default used when no charset is advertised.
        String charset = charsetFromContentType(con.getContentType(), "GBK");

        in = con.getInputStream();
        Document document = Jsoup.parse(IOUtils.toString(in, charset));

        List<News> listNews = new ArrayList<News>();
        for (Element element : document.select(".infos")) {
            Elements titles = element.select(".title");
            if (titles.isEmpty()) {
                // Skip malformed entries instead of aborting the whole crawl.
                continue;
            }
            Elements links = titles.get(0).getElementsByTag("a");
            String url = links.attr("href");

            News news = new News();
            // Only prefix the site root onto relative links; an already
            // absolute href would otherwise become an invalid URL.
            boolean absolute = url.startsWith("http://") || url.startsWith("https://");
            news.setUrl(absolute ? url : site + url);
            news.setTitle(links.text());
            news.setTag(element.select(".node").text());
            listNews.add(news);
        }

        System.out.println("抓取" + listNews.size() + "条");
        for (News item : listNews) {
            System.out.print(item);
        }
    } catch (Exception e) {
        // Keep the crawl best-effort, but never fail silently.
        System.err.println("crawl36 failed for " + urls + ": " + e);
        e.printStackTrace();
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (java.io.IOException ignored) {
                // Nothing useful can be done if closing the response stream fails.
            }
        }
        if (con != null) {
            con.disconnect();
        }
    }
}

/**
 * Extracts the charset parameter from an HTTP Content-Type header value.
 *
 * @param contentType the raw header value, e.g. {@code text/html; charset=utf-8}; may be null
 * @param fallback    the charset name to return when none can be extracted
 * @return the charset name with surrounding whitespace and double quotes
 *         removed, or {@code fallback} when the header is null or has no
 *         usable charset parameter
 */
private static String charsetFromContentType(String contentType, String fallback) {
    if (contentType == null) {
        return fallback;
    }
    // Parameter names are matched case-insensitively ("Charset=UTF-8").
    int key = contentType.toLowerCase(java.util.Locale.ROOT).indexOf("charset");
    if (key == -1) {
        return fallback;
    }
    int eq = contentType.indexOf('=', key);
    if (eq == -1) {
        return fallback;
    }
    String charset = contentType.substring(eq + 1);
    // Drop any parameters that follow the charset.
    int semicolon = charset.indexOf(';');
    if (semicolon != -1) {
        charset = charset.substring(0, semicolon);
    }
    charset = charset.trim();
    if (charset.length() >= 2 && charset.startsWith("\"") && charset.endsWith("\"")) {
        charset = charset.substring(1, charset.length() - 1).trim();
    }
    return charset.isEmpty() ? fallback : charset;
}
待续