爬取 URL 的标题、内容和 icon
前言
前几天接到一个需求:用户输入文章链接后,自动显示出该链接的标题、内容(页面描述)和 icon。下面是具体实现。
一、Controller层
package com.example.study.web.rest;
import com.example.study.service.DTO.URLContentDTO;
import com.example.study.service.GetHTMLService;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
/**
 * REST endpoints under {@code /html} that expose page metadata extracted by
 * {@link GetHTMLService}.
 */
@RestController
@RequestMapping("/html")
public class HTMLResource {

    private final GetHTMLService getHTMLService;

    /**
     * Creates the resource with the service it delegates to.
     *
     * @param getHTMLService service used to fetch and parse the target page
     */
    public HTMLResource(GetHTMLService getHTMLService) {
        this.getHTMLService = getHTMLService;
    }

    /**
     * Handles {@code GET /html/get-content}: passes the given URL to the service and wraps
     * the resulting DTO in a 200 response.
     *
     * @param url the page URL, taken from the {@code url} request parameter
     * @return a 200 response whose body is the DTO returned by the service
     * @throws Exception if the service call fails
     */
    @ApiOperation("2.识别URL内容(爬取标题,内容,icon)")
    @GetMapping("/get-content")
    public ResponseEntity<URLContentDTO> getURLContent(@RequestParam(name = "url") String url) throws Exception {
        return ResponseEntity.ok().body(getHTMLService.getContent(url));
    }
}
二、Service层
package com.example.study.service;
import com.example.study.service.DTO.URLContentDTO;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches a web page with jsoup and extracts its title, description and icon URL.
 */
@Service
public class GetHTMLService {

    /**
     * Matches the scheme and authority at the very start of a URL, for example
     * {@code https://example.com:8080}. Compiled once because it never changes.
     */
    private static final Pattern ORIGIN_PATTERN =
            Pattern.compile("^(?:https?|ftp)://[^/?#]+", Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the page at {@code url} and collects its title, description and icon.
     *
     * <p>The description is the {@code content} attribute of the first element whose
     * {@code name} attribute contains {@code description}; the title is the text of the
     * {@code <title>} tag. Values that are not present on the page are returned as empty
     * strings.
     *
     * @param url the page URL to fetch
     * @return a DTO holding the title, description ("content") and icon URL ("image")
     * @throws IOException if the page cannot be fetched
     */
    public static URLContentDTO getContent(String url) throws IOException {
        // NOTE(review): url is caller-supplied and is fetched from the server side. Consider
        // restricting the allowed target hosts so that internal addresses cannot be probed.
        Document doc = Jsoup.connect(url).get();

        // The jsoup lookups return an empty collection (never null) when nothing matches,
        // and attr()/text() on an empty collection yield "", so no null checks are needed.
        String content = doc.getElementsByAttributeValueContaining("name", "description").attr("content");
        String title = doc.getElementsByTag("title").text();

        URLContentDTO urlContentDTO = new URLContentDTO();
        urlContentDTO.setContent(content);
        urlContentDTO.setImage(resolveIcon(doc, url));
        urlContentDTO.setTitle(title);
        return urlContentDTO;
    }

    /**
     * Finds the page icon and turns it into a URL the client can load.
     *
     * @param doc     the parsed page
     * @param pageUrl the URL the page was requested from, used to complete relative icon paths
     * @return the icon URL, or an empty string when the page declares no icon
     */
    private static String resolveIcon(Document doc, String pageUrl) {
        String href = doc.getElementsByAttributeValueContaining("rel", "shortcut icon").attr("href");
        if (StringUtils.isBlank(href)) {
            // Many pages declare only rel="icon"; fall back to it when no "shortcut icon" exists.
            href = doc.getElementsByAttributeValueContaining("rel", "icon").attr("href");
        }
        if (StringUtils.isBlank(href)) {
            return "";
        }

        href = href.trim();
        // startsWith is safe for values shorter than the prefix, unlike substring(0, n).
        if (href.startsWith("http") || href.startsWith("//")) {
            // Already absolute or protocol-relative: return unchanged.
            return href;
        }

        String origin = getDomainName(pageUrl);
        // Make sure exactly one '/' separates the origin from a relative path.
        return href.startsWith("/") ? origin + href : origin + "/" + href;
    }

    /**
     * Extracts the scheme and authority part of a URL, e.g. {@code https://example.com:8080}
     * for {@code https://example.com:8080/a/b?c=d}.
     *
     * @param url the URL to inspect
     * @return the leading {@code scheme://authority} part, or {@code url} unchanged when it
     *         does not start with an http, https or ftp scheme
     */
    private static String getDomainName(String url) {
        Matcher matcher = ORIGIN_PATTERN.matcher(url);
        // find() with the '^' anchor only ever matches at the start, so a second URL that
        // appears later in the query string cannot influence the result.
        return matcher.find() ? matcher.group() : url;
    }
}
三、URLContentDTO(用来返回数据)
package com.example.study.service.DTO;
/**
 * Data holder returned to the client: the title, content and image extracted for a URL.
 */
public class URLContentDTO {

    /** Page title. */
    private String title;

    /** Page content text. */
    private String content;

    /** Image (icon) URL. */
    private String image;

    /** @return the page title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the page title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the page content text */
    public String getContent() {
        return this.content;
    }

    /** @param content the page content text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the image URL */
    public String getImage() {
        return this.image;
    }

    /** @param image the image URL */
    public void setImage(String image) {
        this.image = image;
    }

    /**
     * Renders all three fields in the form
     * {@code URLContentDTO{title='..', content='..', image='..'}}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("URLContentDTO{");
        text.append("title='").append(title).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append(", image='").append(image).append('\'');
        text.append('}');
        return text.toString();
    }
}
测试结果
我们用csdn官网地址测试
结果如下:
注意:某些页面没有在 <link> 标签中声明 icon,此时返回的 image 为空字符串,偶尔取不到图片属于正常现象。