利用jsoup进行爬取,为公司爬取问答
用的是Spring Boot框架,直接上代码。
controller层:
/**
 * Web entry point that starts the question crawler.
 */
@Controller
public class QuestionController {

    @Autowired
    QuestionService questionService;

    /**
     * Handles requests to {@code /api/spider/get} by running the crawl once.
     * The handler returns nothing.
     *
     * @param question request-bound argument; this handler does not read it
     */
    @ResponseBody
    @RequestMapping(value = "/api/spider/get")
    public void get(Question question) {
        this.questionService.question_crawl();
    }
}
service层:由于没有配置数据库,插入语句暂时被注释掉了。爬取的URL是公司指定的。
package com.example.demo2.service;
import com.example.demo2.dao.QuestionDao;
import com.example.demo2.dao.QuestionMapper;
import com.example.demo2.entity.Answer;
import com.example.demo2.entity.Question;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
/**
 * Crawls the question listing of xinshipu.com with jsoup and prints each
 * question together with its answers to standard output.
 *
 * <p>The DAO calls that would persist the scraped records are kept commented
 * out because no database is configured for this demo.
 */
@Service
public class QuestionService {

    /** Site root against which relative links from the listing are resolved. */
    private static final String BASE_URL = "https://www.xinshipu.com/";

    /** Marker text that follows the question in the detail page's text. */
    private static final String ANSWER_MARKER = "回答问题";

    /** Last listing page to crawl; pages are numbered from 1. */
    private static final int LAST_PAGE = 1;

    @Autowired
    QuestionDao questionDao;

    /**
     * Walks the listing pages and hands every question link to
     * {@link #show(String, int)}.
     *
     * <p>Failures are reported with a stack trace and never propagate to the
     * caller. A failure on one listing page or one question page only skips
     * that page, so the remaining questions are still processed.
     */
    public void question_crawl() {
        int status = 1;
        for (int page = 1; page <= LAST_PAGE; page++) {
            Elements items;
            try {
                Document listing = Jsoup.connect(BASE_URL + "question?page=" + page).get();
                // Each <li> inside the question panel is one question entry.
                items = listing.getElementsByClass("bpannel qandahr p15").select("ul").select("li");
            } catch (Exception e) {
                e.printStackTrace();
                continue;
            }
            for (Element item : items) {
                String href = item.getElementsByClass("fl w410").select("a").attr("href");
                if (href.isEmpty()) {
                    // No link in this entry; fetching BASE_URL alone would scrape the home page.
                    continue;
                }
                try {
                    show(href, status);
                } catch (Exception e) {
                    // Keep going: one broken detail page must not abort the whole crawl.
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Fetches one question page and prints the question and all of its answers.
     *
     * @param attr   link to the question page as found in the listing; may be
     *               relative to the site root or already absolute
     * @param status currently not used; both records are created with status 1
     * @throws Exception if the page cannot be fetched
     */
    public void show(String attr, int status) throws Exception {
        String url = toAbsoluteUrl(attr);
        Document document = Jsoup.connect(url).get();

        // Question
        String pageText = document.getElementsByClass("font16").select("p").text();
        String questionText = extractQuestionText(pageText);
        Question po = new Question();
        po.setType(1);
        po.setStatus(1);
        po.setUrl(url);
        po.setText(questionText);
        System.out.println("问题:" + questionText);
        // questionDao.insertQuestion(po);

        // Answers: span elements carrying a style attribute below class "cg1".
        Elements answerSpans = document.getElementsByClass("cg1").select("span[style]");
        for (Element span : answerSpans) {
            String answerText = span.text().trim();
            Answer answer = new Answer();
            answer.setText(answerText);
            answer.setStatus(1);
            answer.setQuestionid(po.getId());
            System.out.println("答案:" + answerText);
            // questionDao.insertAnswer(answer);
        }
    }

    /** Resolves a listing href against the site root without producing a double slash. */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return BASE_URL + (href.startsWith("/") ? href.substring(1) : href);
    }

    /**
     * Returns the text in front of the answer marker, or the whole text when
     * the marker is absent. Unlike {@code split(...)[0]} this cannot fail when
     * the text consists of the marker alone.
     */
    private static String extractQuestionText(String pageText) {
        int markerAt = pageText.indexOf(ANSWER_MARKER);
        return markerAt < 0 ? pageText : pageText.substring(0, markerAt);
    }
}
dao层和mapper层放在了同一个包里(懒得再单独建包了 - -!),代码如下:
package com.example.demo2.dao;
import com.example.demo2.entity.Answer;
import com.example.demo2.entity.Question;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;
/**
 * Repository that forwards every insert to {@link QuestionMapper}.
 */
@Repository
public class QuestionDao {

    @Autowired
    QuestionMapper questionMapper;

    /**
     * Inserts a question through the mapper.
     *
     * @param po the question to insert
     */
    public void insertQuestion(Question po) {
        this.questionMapper.insertQuestion(po);
    }

    /**
     * Inserts an answer through the mapper.
     *
     * @param answer the answer to insert
     */
    public void insertAnswer(Answer answer) {
        this.questionMapper.insertAnswer(answer);
    }
}
package com.example.demo2.dao;
import com.example.demo2.entity.Answer;
import com.example.demo2.entity.Question;
import org.apache.ibatis.annotations.Mapper;
import org.springframework.stereotype.Repository;
/**
 * MyBatis mapper declaring the insert operations for questions and answers.
 * The SQL for these methods is not part of this interface; it is expected to
 * be supplied separately (for example in a mapper XML file).
 */
@Mapper
@Repository
public interface QuestionMapper {
/**
 * Inserts one question record.
 *
 * @param question the question to insert
 */
void insertQuestion(Question question);
/**
 * Inserts one answer record.
 *
 * @param answer the answer to insert
 */
void insertAnswer(Answer answer);
}
实体就不具体写了,需要什么大家自己创建
/**
 * Entity for one answer to a question.
 */
public class Answer {
    private int id;
    private String text;
    /** Id of the question this answer belongs to. */
    private int questionid;
    private int status;
    // Fixed type: the original declared the non-existent type "Data".
    private java.util.Date createtime;
    // Getters and setters are omitted here; add them as needed.
}
/**
 * Entity for one question.
 */
public class Question {
    private int id;
    private String text;
    private int status;
    private int type;
    /** Address of the page the question was taken from. */
    private String url;
    // Fully qualified because this snippet carries no import for Date.
    private java.util.Date createtime;
    // Getters and setters are omitted here; add them as needed.
}
具体的数据库配置就不写了,mapper.xml文件中的SQL语句相信大家都会写(我这个菜鸟都会写)!
测试
先打开浏览器,在地址栏输入 http://localhost:8089/api/spider/get 并回车,控制台打印的结果如下:
第一次写博客,随便写写,不太会组织语言,希望大家见谅,多多提意见。