根据url读取html有两种方式
1.HttpURLConnection
2.Jsoup
两种方式的 maven 依赖(注:HttpURLConnection 为 JDK 自带,无需额外依赖;下面代码实际只依赖 jsoup,httpcomponents 与 guava 并非必需):
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpcore -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.5</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>4.5.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>27.0.1-jre</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
两种读取方式的代码实现及部分辅助方法:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
public class ReadHTMLByUrl {

    /**
     * Reads the raw HTML of a page via {@link HttpURLConnection}.
     *
     * @param u        the page URL as a string
     * @param encoding character encoding used to decode the response body
     * @return the page content with line breaks removed (lines are concatenated)
     * @throws Exception if the URL is malformed or the connection/read fails
     */
    public static String readFile(String u, String encoding) throws Exception {
        URL url = new URL(u); // build a URL object from the string link
        HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection(); // open the URL
        StringBuilder html = new StringBuilder();
        // try-with-resources guarantees the stream is closed even when reading fails
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(urlConnection.getInputStream(), encoding))) {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line);
            }
        } finally {
            urlConnection.disconnect(); // release the underlying connection
        }
        return html.toString();
    }

    /**
     * Fetches a page with jsoup and returns the inner HTML of its {@code <body>}.
     * On an I/O failure, retries up to 3 times with a 5-second pause between attempts.
     *
     * @param url the page URL
     * @return the body HTML, or {@code ""} if the page could not be fetched
     */
    public static String JsoupBodyHtml(String url) {
        Document doc = null;
        try {
            doc = Jsoup.connect(url).get();
        } catch (IOException e) {
            for (int i = 0; i < 3 && doc == null; i++) {
                try {
                    Thread.sleep(5 * 1000L);
                    doc = Jsoup.connect(url).get();
                    // connection succeeded — stop retrying
                    break;
                } catch (InterruptedException e1) {
                    // restore the interrupt flag and stop retrying; swallowing it
                    // would hide the interruption from callers
                    Thread.currentThread().interrupt();
                    break;
                } catch (IOException ignored) {
                    // best-effort retry: deliberately ignored, next iteration tries again
                }
            }
        }
        if (doc == null || doc.body() == null) {
            return "";
        }
        return doc.body().html();
    }

    /**
     * Fetches a page with jsoup and returns the whole document as a string.
     * On an I/O failure, waits 20 seconds and retries once.
     *
     * @param url the page URL
     * @return the full document HTML, or {@code ""} if both attempts failed
     */
    public static String JsoupHtml(String url) {
        Document doc = null;
        try {
            doc = Jsoup.connect(url).get();
        } catch (IOException e) {
            try {
                Thread.sleep(20 * 1000L);
                doc = Jsoup.connect(url).get();
            } catch (IOException e1) {
                e1.printStackTrace();
            } catch (InterruptedException e1) {
                // restore the interrupt flag instead of discarding it
                Thread.currentThread().interrupt();
            }
            e.printStackTrace();
        }
        return doc == null ? "" : doc.toString();
    }

    /**
     * Fetches a page and rewrites its body: makes relative {@code <img src>} values
     * absolute (prefixed with the page's directory URL) and strips any
     * {@code style} attribute whose value mentions "font".
     *
     * @param url the page URL; must contain at least one '/'
     * @return the rewritten body HTML, or {@code ""} on any failure
     */
    public static String reWriteHtml(String url) {
        try {
            // directory part of the URL, up to and including the last '/'
            String domainUrl = url.substring(0, url.lastIndexOf("/") + 1);
            Document doc = Jsoup.connect(url).get();
            Element body = doc.body();
            // jsoup's select() never returns null, so no null-guard is needed
            for (Element img : body.select("img")) {
                String src = img.attr("src");
                // turn a relative image path into an absolute one
                img.attr("src", src.startsWith("http") ? src : domainUrl + src);
            }
            // remove style attributes that set font properties
            for (Element e : body.getAllElements()) {
                if (e.attr("style").contains("font")) {
                    e.removeAttr("style");
                }
            }
            return body.html();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Fetches a page body via {@link #JsoupBodyHtml(String)} and rewrites every
     * relative {@code <img src>} in it to an absolute URL based on the page's
     * directory URL.
     *
     * @param url the page URL; must contain at least one '/'
     * @return the body HTML with rewritten image sources, or {@code ""} if the
     *         page could not be fetched
     */
    public static String repairContent(String url) {
        String content = JsoupBodyHtml(url);
        if ("".equals(content)) {
            return "";
        }
        // prefix to prepend to relative img src values
        String replaceHttp = url.substring(0, url.lastIndexOf("/") + 1);
        String patternStr = "<img\\s*([^>]*)\\s*src=\\\"(.*?)\\\"\\s*([^>]*)>";
        return replSrc(content, replaceHttp, patternStr);
    }

    /**
     * Replaces every relative {@code src} value matched by {@code patternStr}
     * (capture group 2) with {@code replaceHttp + src}.
     *
     * @param content     HTML content to rewrite
     * @param replaceHttp prefix for relative sources
     * @param patternStr  regex whose second capture group is the src value
     * @return the rewritten content
     */
    private static String replSrc(String content, String replaceHttp, String patternStr) {
        Pattern pattern = Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(content);
        // collect matches first and de-duplicate by src to avoid replacing twice
        Map<String, String> repMap = new HashMap<String, String>();
        while (matcher.find()) {
            String src = matcher.group(2);
            // only rewrite sources that are not already absolute http(s) URLs
            if (!src.startsWith("http://") && !src.startsWith("https://")) {
                repMap.putIfAbsent(src, replaceHttp + src);
            }
        }
        // NOTE: replace() rewrites every occurrence of the src text in the whole
        // content, not only inside <img> tags — preserved from the original logic
        for (Map.Entry<String, String> entry : repMap.entrySet()) {
            content = content.replace(entry.getKey(), entry.getValue());
        }
        return content;
    }

    /**
     * Collects the second capture group of every match of {@code regex} in
     * {@code source}.
     *
     * @param regex  pattern with at least two capture groups; group 2 is extracted
     * @param source text to scan
     * @return list of group-2 values, in match order (possibly empty)
     */
    private static List<String> getMatchers(String regex, String source) {
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(source);
        List<String> list = new ArrayList<String>();
        while (matcher.find()) {
            list.add(matcher.group(2));
        }
        return list;
    }
}