Maven HttpClient Jsoup Crawler Introduction (Part 2)

Core code for writing the crawled text to a txt file

package util;


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

public class StoreTxt {

    public static void operateTxt(String url) throws IOException {
        String all = "";
        String content = HttpClientUtil.testHttpClient(url);
        Document doc = Jsoup.parse(content); // parse the page into a Document object

        File file = new File("D:\\纣临.txt");
        Elements h1Elements = doc.getElementsByTag("h1"); // query the DOM by tag name
        Element h1Element = h1Elements.get(0);
        String h1 = h1Element.text();
        System.out.println("题目:" + h1);
        all = all + "书名:" + h1 + "\r\n";

        Elements authorElements = doc.select("#info p"); // author line
        Element authorElement = authorElements.get(0);
        String author = authorElement.text();
        all = all + author + "\r\n";
        System.out.println(author);

        Element introElement = doc.getElementById("intro"); // synopsis
        String intro = introElement.text();
        System.out.println("简介" + intro);
        all = all + "简介" + intro + "\r\n";

        Elements hrefElements = doc.select("#list dl dd a"); // chapter links

        for (Element e : hrefElements) {
            String urlIndex = "http://www.biquge.com.tw" + e.attr("href");
            String contentIndex = HttpClientUtil.testHttpClient(urlIndex);
            Document docIndex = Jsoup.parse(contentIndex); // parse the chapter page

            // chapter title
            Elements chapterElements = docIndex.getElementsByTag("h1"); // query the DOM by tag name
            Element chapterElement = chapterElements.get(0);
            String chapter = chapterElement.text();

            // chapter body: collapse runs of whitespace into line breaks
            String textsIndex = docIndex.select("#content").text().replaceAll("\\s+", "\r\n");

            all = all + chapter + "\r\n" + textsIndex + "\r\n";
        }

        // write everything in one pass, appending to the file
        FileWriter fileWriter = new FileWriter(file, true);
        fileWriter.write(all);
        fileWriter.flush();
        fileWriter.close();
        System.out.println(all);
    }
}
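
One caveat about the final write: FileWriter always uses the JVM's default charset, so on a machine whose default is not GBK the Chinese text can come out garbled. Below is a minimal sketch of an explicit-charset alternative, reusing the all string from operateTxt; choosing GBK to match the site's encoding is my assumption, and UTF-8 works just as well if used consistently:

// Hedged sketch: write with an explicit charset instead of the platform default.
// Needs: java.io.FileOutputStream, java.io.OutputStreamWriter, java.io.Writer,
//        java.nio.charset.Charset
try (Writer writer = new OutputStreamWriter(
        new FileOutputStream("D:\\纣临.txt", true), Charset.forName("GBK"))) {
    writer.write(all); // try-with-resources flushes and closes automatically
}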


HttpClient utility class

package util;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

class HttpClientUtil {

    private static CloseableHttpClient httpClient;

    public static String testHttpClient(String url) {
        String content = "";
        CloseableHttpResponse response = null;
        try {
            // create an HttpClient with the default configuration
            httpClient = HttpClients.createDefault();

            // execute the GET request
            response = getResponse(url);

            // read the response entity; the site serves GBK-encoded HTML
            HttpEntity entity = response.getEntity();
            content = EntityUtils.toString(entity, "GBK");

            response.close();
            httpClient.close(); // release the client so connections are not leaked
        } catch (IOException e) {
            e.printStackTrace();
        }
        return content;
    }

    public static CloseableHttpResponse getResponse(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        return httpClient.execute(httpGet);
    }
}
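
Creating a fresh client for every request works, but nothing closes it if an exception is thrown mid-request. Below is a minimal try-with-resources sketch that guarantees cleanup; the method name fetchHtml is my own, not part of the original class, and it uses the same imports as HttpClientUtil:

// Hedged sketch: try-with-resources closes both the client and the response,
// even when an exception is thrown before the reads complete.
public static String fetchHtml(String url) throws IOException {
    try (CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse response = client.execute(new HttpGet(url))) {
        return EntityUtils.toString(response.getEntity(), "GBK");
    }
}

For bulk crawling it is cheaper still to create one client, reuse it for every chapter request, and close it only after the whole book has been written.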


Main method entry point

import util.StoreTxt;
import java.io.IOException;

public class Main {
    public static void main(String[] args) {

        try {
            StoreTxt.operateTxt("http://www.biquge.com.tw/17_17380/");

        } catch (IOException e) {
            e.printStackTrace();
        }

    }
}

Problems encountered and solutions

Writing to the txt file: the second FileWriter argument opens the file in append mode, so new content is added to the end instead of overwriting what is already there:

fileWriter = new FileWriter(file, true);

Writing the txt in one pass: instead of opening a writer inside the chapter loop, the whole book is accumulated first and written once at the end:

        FileWriter fileWriter = new FileWriter(file, true);
        fileWriter.write(all);
        fileWriter.flush();
        fileWriter.close();
        System.out.println(all);

The accumulator for that single write is declared up front:

String all = ""; // collect everything first, then write it to the txt file in one go

A further improvement is to build the text with a StringBuffer (or StringBuilder): every all = all + ... allocates a brand-new string and recopies everything collected so far, which gets slower and slower as the book grows.
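
Below is a minimal sketch of that rewrite, reusing the variable names from operateTxt above; only the accumulation changes, everything else stays the same:

// Hedged sketch: StringBuilder appends in place instead of recopying on every +.
StringBuilder all = new StringBuilder();
all.append("书名:").append(h1).append("\r\n");
all.append(author).append("\r\n");
all.append("简介").append(intro).append("\r\n");

for (Element e : hrefElements) {
    // ... fetch and parse the chapter exactly as before ...
    all.append(chapter).append("\r\n").append(textsIndex).append("\r\n");
}

fileWriter.write(all.toString()); // convert to String once, at write time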



The open-source Java tools HttpClient and Jsoup make it easy to fetch web pages and extract structured data, which suits crawler development, data analysis, and similar scenarios. The steps and examples are as follows:

### Adding the dependency

Add the following dependency to the Maven project's `pom.xml`:

```xml
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>
```

The above is the dependency configuration for Jsoup 1.8.3.

### Example: combining HttpClient and Jsoup

Below is a simple example that uses HttpClient to send a request and fetch the page content, then uses Jsoup to parse that content:

```java
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

public class HttpClientJsoupExample {
    public static void main(String[] args) {
        // create an HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // create the GET request
        HttpGet httpGet = new HttpGet("https://example.com"); // replace with the real URL
        try {
            // execute the request
            CloseableHttpResponse response = httpClient.execute(httpGet);
            // get the response entity
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // convert the entity content to a string
                String html = EntityUtils.toString(entity);
                // parse the HTML string with Jsoup
                Document doc = Jsoup.parse(html);
                // find elements with selector syntax
                Elements links = doc.select("a[href]");
                for (Element link : links) {
                    System.out.println("Link: " + link.attr("href"));
                    System.out.println("Text: " + link.text());
                }
            }
            // close the response
            response.close();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                // close the HttpClient
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
```

The code above sends a GET request with HttpClient to fetch the page, then uses Jsoup to parse the HTML and list every link element.

### Other usage

- **HttpClient**: GET requests (with or without parameters), POST requests (with or without parameters), connection-pool requests, request configuration, and so on. For example, a GET request without parameters:

```java
CloseableHttpClient httpClient = HttpClients.createDefault();
HttpGet httpGet = new HttpGet("https://example.com");
CloseableHttpResponse response = httpClient.execute(httpGet);
```

- **Jsoup**: can parse a URL, a string, or a file, traverse the document DOM-style, and find elements with selector syntax. For example, parsing straight from a URL:

```java
Document doc = Jsoup.connect("https://example.com").get();
```
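
One thing the dependency section above omits: the HttpClient examples also need the HttpClient artifact itself on the classpath. Below is a sketch of the corresponding coordinates; the 4.5.13 version number is my assumption, and any HttpClient 4.x release that matches the code should do:

```xml
<!-- assumed version: any HttpClient 4.x release works with the code above -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.13</version>
</dependency>
```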