部分结果截图如下：
我使用的是 IntelliJ IDEA，首先建立一个普通的 Maven 项目。
先创建一个实体类来存储电影信息
/**
* Entity class that holds the information extracted from one movie page.
* Getters and setters are omitted from this listing.
*/
public class Page {
// Record id
private int id;
// Number of comments (kept as text)
private String commentcount;
// Movie title
private String title;
// Movie introduction / synopsis
private String jieshao;
// Movie rating score (kept as text)
private String fenshu;
// ... getters and setters omitted here
}
创建一个通过 URL 获取网页 HTML 的工具类 PageDownLoadUtil，用来把网页内容下载下来。
package util;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Utility for downloading the raw content of a web page over HTTP.
 */
public class PageDownLoadUtil {
    /**
     * Issues an HTTP GET for the given URL and returns the response body as text.
     *
     * <p>The body is decoded with the charset declared by the response; UTF-8 is
     * used as the fallback when the response declares none.
     *
     * @param url address of the page to download
     * @return the response body, or {@code null} if the request fails with an
     *         {@link IOException} or the response carries no entity
     */
    public static String getPageContent(String url) {
        HttpClientBuilder builder = HttpClients.custom();
        HttpGet request = new HttpGet(url);
        String content = null;
        // try-with-resources closes the response first and then the client,
        // so the underlying connection is released even when reading fails.
        try (CloseableHttpClient client = builder.build();
             CloseableHttpResponse response = client.execute(request)) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // Fall back to UTF-8 instead of the library default (ISO-8859-1)
                // so pages without an explicit charset are not garbled.
                content = EntityUtils.toString(entity, "UTF-8");
            }
        } catch (IOException e) {
            // Best effort: report the failure and let the caller see null.
            e.printStackTrace();
        }
        return content;
    }
}
做一个提取标签的工具类 htmlUtil
package util;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
public class htmlUtil {
/**
* 获取标签属性值
* @param tagNode
* @param xpath
* @param att 匹配标签值
* @return
*/
public static String getAttributeByName(TagNode