识别URL内容(爬取标题，内容，icon)_通过url路径识别内容-CSDN博客

本文链接：https://blog.csdn.net/weixin_45201611/article/details/109075218

本文介绍了一种通过 Controller 层与 Service 层配合、利用 Jsoup 抓取 URL 对应页面的标题、内容和图标的实现。以 CSDN 官网为例，展示了如何构造 URLContentDTO 并处理特殊情况。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

爬取url标题，内容，icon

前言
一、Controller层
二、Service层
三、URLContentDTO（用来返回数据）
测试结果

前言

前几天接到一个需求：用户输入文章链接后，自动显示该链接的标题、内容和 icon。下面是具体实现。

一、Controller层

package com.example.study.web.rest;

import com.example.study.service.DTO.URLContentDTO;
import com.example.study.service.GetHTMLService;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;

/**
 * REST resource mounted at {@code /html} that exposes URL content recognition.
 */
@RestController
@RequestMapping("/html")
public class HTMLResource {

    private final GetHTMLService getHTMLService;

    /**
     * Creates the resource with the service it delegates to.
     *
     * @param getHTMLService service that fetches a page and extracts its content
     */
    public HTMLResource(GetHTMLService getHTMLService) {
        this.getHTMLService = getHTMLService;
    }

    /**
     * Recognizes the content behind a URL (title, content and icon, per the operation label)
     * by delegating to {@code GetHTMLService.getContent}.
     *
     * @param url value of the required {@code url} request parameter
     * @return an OK response whose body is the extracted {@link URLContentDTO}
     * @throws Exception propagated unchanged from the service call
     */
    @ApiOperation("2.识别URL内容(爬取标题，内容，icon)")
    @GetMapping("/get-content")
    public ResponseEntity<URLContentDTO> getURLContent(@RequestParam(name = "url") String url) throws Exception {
        return ResponseEntity.ok(getHTMLService.getContent(url));
    }
}

二、Service层

package com.example.study.service;

import com.example.study.service.DTO.URLContentDTO;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Service;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Fetches a web page and extracts the data needed for a link preview:
 * the page title, its meta description and its shortcut icon.
 */
@Service
public class GetHTMLService {

    /**
     * Matches the origin of a URL: scheme, host (name or dotted IPv4) and an optional port.
     * Compiled once; ports may have up to five digits (e.g. {@code :65535}).
     */
    private static final Pattern ORIGIN_PATTERN = Pattern.compile(
        "(?:https?|ftp)://[a-zA-Z0-9._-]+(?::[0-9]{1,5})?", Pattern.CASE_INSENSITIVE);

    /**
     * Downloads the page at {@code url} and collects its title, description and icon.
     *
     * @param url absolute URL of the page to inspect
     * @return a DTO holding the title, the {@code content} of the description meta element and
     *         the icon URL; a field is an empty string when the page has no matching element
     * @throws IOException if the page cannot be fetched
     */
    public static URLContentDTO getContent(String url) throws IOException {
        Document doc = Jsoup.connect(url).get();

        // jsoup returns an empty element list (never null) when nothing matches, and attr()
        // on an empty list yields "", so no null checks or repeated queries are needed.
        String iconHref = doc.getElementsByAttributeValueContaining("rel", "shortcut icon").attr("href");
        String description = doc.getElementsByAttributeValueContaining("name", "description").attr("content");

        URLContentDTO urlContentDTO = new URLContentDTO();
        urlContentDTO.setContent(description);
        urlContentDTO.setImage(resolveIconUrl(url, iconHref));
        // Document.title() reads the document's own <title>; collecting every "title" tag
        // would also concatenate <title> elements nested in inline SVG.
        urlContentDTO.setTitle(doc.title());
        return urlContentDTO;
    }

    /**
     * Turns the icon {@code href} found in the page into a URL usable by the client.
     *
     * @param pageUrl URL of the page the href was found in
     * @param href    raw {@code href} attribute value, possibly blank
     * @return {@code ""} for a blank href; the href itself when it is absolute
     *         ({@code http...}) or protocol-relative ({@code //...}); otherwise the page
     *         origin joined with the href by exactly one {@code /}
     */
    private static String resolveIconUrl(String pageUrl, String href) {
        if (StringUtils.isBlank(href)) {
            return "";
        }
        String trimmed = href.trim();
        // startsWith is safe for hrefs shorter than the prefix, unlike substring(0, n).
        if (trimmed.startsWith("http") || trimmed.startsWith("//")) {
            return trimmed;
        }
        String origin = getDomainName(pageUrl);
        // A relative href such as "favicon.ico" needs a separator after the host.
        return trimmed.startsWith("/") ? origin + trimmed : origin + "/" + trimmed;
    }

    /**
     * Extracts the origin ({@code scheme://host[:port]}) from a URL.
     *
     * @param url URL to inspect
     * @return the first origin found in {@code url}, or {@code url} unchanged when none is found
     */
    private static String getDomainName(String url) {
        Matcher matcher = ORIGIN_PATTERN.matcher(url);
        // Take the match directly instead of deriving it from split() lengths, which breaks
        // when the path or query string contains another URL.
        return matcher.find() ? matcher.group() : url;
    }

}

三、URLContentDTO（用来返回数据）

package com.example.study.service.DTO;

/**
 * Mutable data holder returned to the client with three string properties:
 * {@code title}, {@code content} and {@code image}.
 */
public class URLContentDTO {

    private String title;
    private String content;
    private String image;

    /** @return the current title, or {@code null} if never set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the current content, or {@code null} if never set */
    public String getContent() {
        return this.content;
    }

    /** @param content the content to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the current image value, or {@code null} if never set */
    public String getImage() {
        return this.image;
    }

    /** @param image the image value to store */
    public void setImage(String image) {
        this.image = image;
    }

    /**
     * Renders all three fields, each wrapped in single quotes, for debugging output.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("URLContentDTO{");
        text.append("title='").append(title).append('\'');
        text.append(", content='").append(content).append('\'');
        text.append(", image='").append(image).append('\'');
        return text.append('}').toString();
    }
}