package com.tl.spider.parser.impl;
import com.tl.spider.download.WebPageDownLoadUtil;
import com.tl.spider.parser.interfaces.ParseFieldsInterface;
import com.tl.spider.pojos.ParserResultEntity;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Jsoup-based implementation of {@link ParseFieldsInterface}.
 *
 * <p>Despite the "Xpath" in its name, this class locates elements with Jsoup CSS
 * selectors: it takes the first {@code ul.tj3_1} list in the document and turns each
 * of its {@code li} items into a {@link ParserResultEntity}.
 *
 * <p>Created 2019/5/19 16:09.
 *
 * @author Administrator
 * @version 1.0
 */
public class ParseFields4Xpath implements ParseFieldsInterface {

    /** CSS selector of the list container whose items are extracted. */
    private static final String LIST_SELECTOR = "ul.tj3_1";

    /** CSS selector of a single entry inside the list container. */
    private static final String ITEM_SELECTOR = "li";

    /** CSS selector, relative to an entry, of the element(s) whose text is used as the title. */
    private static final String TITLE_SELECTOR = "a";

    /** CSS selector, relative to an entry, of the element(s) whose text is used as the post date. */
    private static final String POST_DATE_SELECTOR = "font";

    /**
     * Parses an HTML document and extracts one result per {@code li} of the first
     * {@code ul.tj3_1} element.
     *
     * <p>For every entry the title is the combined text of its {@code a} elements, the
     * post date is the combined text of its {@code font} elements, and the insert date
     * is the string form of the time at which this method was called (the same value
     * for all entries of one call).
     *
     * @param htmlContent the HTML source to parse; may be {@code null}
     * @return a new mutable list of parsed entries, in document order; empty when
     *         {@code htmlContent} is {@code null} or the document contains no
     *         {@code ul.tj3_1} element; never {@code null}
     */
    @Override
    public List<ParserResultEntity> parseHtml(String htmlContent) {
        List<ParserResultEntity> results = new ArrayList<>();
        if (htmlContent == null) {
            return results;
        }

        Document doc = Jsoup.parse(htmlContent);

        // Elements.first() yields null when nothing matched; without this guard a page
        // that lacks the expected list would fail with a NullPointerException.
        Element listContainer = doc.select(LIST_SELECTOR).first();
        if (listContainer == null) {
            return results;
        }

        // Computed once so that every entry of this run carries the same timestamp.
        String insertDate = new Date().toString();

        Elements items = listContainer.select(ITEM_SELECTOR);
        for (Element item : items) {
            ParserResultEntity entity = new ParserResultEntity();
            entity.setTitle(item.select(TITLE_SELECTOR).text());
            entity.setPostDate(item.select(POST_DATE_SELECTOR).text());
            entity.setInsertDate(insertDate);
            results.add(entity);
        }
        return results;
    }

    /**
     * Manual smoke test: obtains the page content for a fixed URL through
     * {@link WebPageDownLoadUtil#getHtmlSourceBySocket}, prints its length, parses it
     * and prints every extracted entry.
     *
     * @param args unused
     * @throws Exception propagated from the download helper
     */
    public static void main(String[] args) throws Exception {
        String url = "http://news.youth.cn/gn/";
        // The encoding can be found by looking at the meta charset in the page source.
        String charSet = "gb2312";

        String content = WebPageDownLoadUtil.getHtmlSourceBySocket(url, charSet);
        System.out.println(content.length());

        ParseFields4Xpath parser = new ParseFields4Xpath();
        List<ParserResultEntity> results = parser.parseHtml(content);
        for (ParserResultEntity message : results) {
            System.out.println(message.toString());
        }
    }
}