<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
package com.tps.common;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Crawler that downloads one page of the nowcoder "review-java" question set and
 * prints the question and its answer box to standard output.
 *
 * @author Sum
 * @date 2017-09-29 9:13
 */
public class Robot {

    /** Address of the review pages; the page number is appended to it. */
    private static final String REVIEW_URL =
            "https://www.nowcoder.com/ta/review-java/review?page=";

    /** Connect and read timeout, so a stalled server cannot hang the crawler forever. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Fetches the review page with the given number and prints, in order: the page
     * number followed by the question text, the answer's own text, and the full HTML
     * of the answer element.
     *
     * <p>If the page cannot be downloaded, or it does not contain both a
     * {@code final-question} and a {@code design-answer-box} element, a diagnostic
     * line is printed instead and the method returns normally.
     *
     * @param page number of the review page to fetch
     */
    public static void getQuestion(int page) {
        String html = fetch(REVIEW_URL + page);
        Document doc = Jsoup.parse(html);

        Elements title = doc.getElementsByClass("final-question");
        Elements question = doc.getElementsByClass("design-answer-box");

        // A failed download yields an empty document, and the site may change its
        // markup; indexing an empty result would throw IndexOutOfBoundsException.
        if (title.isEmpty() || question.isEmpty()) {
            System.out.println("No question found on page " + page);
            return;
        }

        System.out.println(page + "." + title.get(0).ownText());
        System.out.println(question.get(0).ownText());
        System.out.println(question.get(0));
    }

    /**
     * Downloads the resource at {@code url} and returns its body as text, with each
     * line terminated by {@code '\n'}.
     *
     * <p>I/O failures are reported on standard output together with a stack trace;
     * whatever was read before the failure (possibly nothing) is returned.
     *
     * @param url address to send the GET request to
     * @return the text read from the response, never {@code null}
     */
    private static String fetch(String url) {
        StringBuilder body = new StringBuilder();
        try {
            URLConnection connection = new URL(url).openConnection();
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            // Decode explicitly instead of relying on the platform default charset.
            // NOTE(review): assumes the site serves UTF-8 - confirm against the
            // response's Content-Type header.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    body.append(line).append('\n');
                }
            }
        } catch (IOException e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        }
        return body.toString();
    }

    /**
     * Entry point: prints the question on page 13.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // To crawl the whole set instead, call getQuestion(i) for i = 1..119.
        getQuestion(13);
    }
}