为了课程设计准备数据,爬取驾考的题库,包括题目、选项、答案、解析、图片
模型:Question
/**
 * Entity holding one question crawled from the driving-test question bank.
 *
 * <p>All values are stored as plain text/numbers exactly as the crawler extracted them.
 */
@Entity
public class Question {
// Identifier. No mapping annotation is visible in this excerpt; presumably the primary key (TODO confirm).
private int id;
// Sequence number of the question; the crawler assigns 1, 2, 3, ... in crawl order.
private Integer num;
// Question type in the form "<subject>:<kind>", where kind is "judge", "one" or "more" (e.g. "1:judge").
private String type;
// Chapter name, taken by the crawler from the page's navigation links.
private String chapter;
// Text of the question.
private String question;
// Option texts joined with ':'; the crawler stores "对:错" for true/false questions.
private String options;
// Text of the correct answer as shown on the page.
private String answer;
// Explanation of the answer.
private String analysis;
// Local file path of the question's picture, if any; the crawler writes "0" when there is none.
private String picpath;
// URL of the question's picture; the crawler writes "0" when there is none.
private String picurl;
// Remaining members are elided in this excerpt (presumably getters/setters; the crawler calls setNum, setType, ...).
...
}
目标网址:https://www.ybjk.com/tiku/02dec.htm
/**
 * Crawls the ybjk.com question bank one page at a time and stores every question through
 * {@code questionService}.
 *
 * <p>Starting from a fixed entry URL, each page yields one question (number, type, chapter,
 * text, options, answer, explanation and optional picture). The crawl then follows the last
 * link in the page's navigation table to the next question. It stops when that link is
 * missing, does not resolve to an http(s) URL, or leads to a page that was already crawled,
 * so a "previous question" link on the final page cannot cause an endless loop.
 *
 * @param questionService service used to persist each crawled {@code Question}
 * @throws IOException if a page cannot be fetched, or if a page does not contain the expected
 *     question/answer elements
 */
public void getInfo(QuestionService questionService) throws IOException {
    // Subject code used as the prefix of the question type: "1" = subject one, "4" = subject four.
    String subject = "1";
    String url = "https://www.ybjk.com/tiku/02dec.htm";
    // To crawl subject four instead, use:
    // url = "https://www.ybjk.com/tiku/d704f.htm";
    // subject = "4";

    // Directory the question pictures are downloaded into.
    String imageDir = "H:/dirver_image/";

    // Pages already crawled; Set.add returns false on a repeat, which ends the loop.
    java.util.Set<String> visited = new java.util.HashSet<>();
    int i = 1;
    while (!url.equals("") && visited.add(url)) {
        Document document = Jsoup.connect(url).timeout(4000).userAgent("Mozilla").get();

        // Sequence number of this question within the crawl.
        String num = String.valueOf(i++);

        // Chapter: the fourth breadcrumb link; left empty if the breadcrumb is shorter.
        Elements navLinks = document.select(".h_Nav a");
        String chapter = navLinks.size() > 3 ? navLinks.get(3).text() : "";

        // Question text and correct answer are mandatory; fail with context instead of an NPE.
        Element questionElement = document.selectFirst("#WinContent div strong a");
        Element answerElement = document.selectFirst("#WinContent div ul li i u");
        if (questionElement == null || answerElement == null) {
            throw new IOException("Unexpected page structure (question or answer not found): " + url);
        }
        String question = questionElement.text();
        String answer = answerElement.text();

        // Derive the question type from the shape of the answer.
        String type;
        if (answer.equals("对") || answer.equals("错")) {
            // True/false question.
            type = subject + ":judge";
        } else if (answer.length() == 1) {
            // Single-choice question.
            type = subject + ":one";
        } else {
            // Multiple-choice question.
            type = subject + ":more";
        }

        // Options: fixed for true/false questions, otherwise the first four list items.
        String option;
        if (type.equals(subject + ":judge")) {
            option = "对:错";
        } else {
            Elements optionItems = document.select("#WinContent div ul li");
            StringBuilder joined = new StringBuilder();
            int limit = Math.min(4, optionItems.size());
            for (int k = 0; k < limit; k++) {
                // Every option is followed by ':' (including the last one), as stored so far.
                joined.append(optionItems.get(k).text()).append(":");
            }
            option = joined.toString();
        }

        // Explanation: the second paragraph of the note section; left empty if absent.
        Elements noteParagraphs = document.select("#NoteContent p");
        String analysis = noteParagraphs.size() > 1 ? noteParagraphs.get(1).text() : "";

        // Picture: download it and remember both the remote URL and the local file path.
        String picpath;
        String picurl;
        Element picElement = document.selectFirst("#WinContent div div img.min");
        if (picElement != null) {
            // absUrl resolves protocol-relative ("//host/x.gif") and relative sources alike.
            picurl = picElement.absUrl("src");
            String picName = subject + "_" + num + ".gif";
            writePicTo(imageDir, picName, picurl);
            // writePicTo reports failures only on stderr, so this is the intended location.
            picpath = imageDir + picName;
        } else {
            // "0" marks a question without a picture.
            picpath = "0";
            picurl = "0";
        }

        System.out.println("题号: " + num);
        System.out.println("类型: " + type);
        System.out.println("章节: " + chapter);
        System.out.println("题目: " + question);
        System.out.println("选项: " + option);
        System.out.println("答案: " + answer);
        System.out.println("题解: " + analysis);
        System.out.println("图片: " + picurl);
        System.out.println("图片本地地址: " + picpath);

        Question question1 = new Question();
        question1.setNum(Integer.parseInt(num));
        question1.setType(type);
        question1.setChapter(chapter);
        question1.setQuestion(question);
        question1.setOptions(option);
        question1.setAnswer(answer);
        question1.setAnalysis(analysis);
        question1.setPicpath(picpath);
        question1.setPicurl(picurl);
        // Persist the question.
        questionService.add(question1);

        // Next question: the last link in the navigation table, resolved against the page URL.
        Element nextLink = document.select("#WinContent div table tr td a").last();
        String nextUrl = nextLink == null ? "" : nextLink.absUrl("href");
        // Anything that is not an http(s) URL (e.g. a javascript: link) ends the crawl.
        url = nextUrl.startsWith("http") ? nextUrl : "";
        System.out.println("url: " + url);
    }
}
/**
 * Downloads the resource at {@code url} and writes it to the file {@code path + name}.
 *
 * <p>The target directory is created if it does not exist. Both streams are closed when the
 * copy finishes or fails. Failures are not propagated: the stack trace is printed and the
 * method returns, so one broken picture does not abort the caller.
 *
 * @param path output directory, including the trailing separator
 * @param name file name of the picture, including its extension
 * @param url address of the picture to download
 */
private void writePicTo(String path, String name, String url) {
    File target = new File(path + name);
    File parentDir = target.getParentFile();
    if (parentDir != null) {
        // Result ignored on purpose: if the directory cannot be created, opening the
        // output stream below fails with a FileNotFoundException that is reported.
        parentDir.mkdirs();
    }
    try {
        // Open the connection to the picture.
        URLConnection connection = new URL(url).openConnection();
        // try-with-resources closes both streams (output first) even when the copy fails.
        try (InputStream is = connection.getInputStream();
             OutputStream os = new FileOutputStream(target)) {
            System.out.println("保存图片到: " + path + name);
            // Copy the response body to the file in 1 KiB chunks.
            byte[] b = new byte[1024];
            int count;
            while ((count = is.read(b)) != -1) {
                os.write(b, 0, count);
            }
        }
    } catch (IOException e) {
        // MalformedURLException is an IOException, so a bad URL is reported here as well.
        e.printStackTrace();
    }
}