Maven HttpClient Jsoup Crawler Introduction (Part 2)

Core code for writing the crawled text to a txt file

package util;


import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

public class StoreTxt {

    public static void operateTxt(String url) throws IOException {
        String all = "";
        String content = HttpClientUtil.testHttpClient(url);
        Document doc = Jsoup.parse(content); // parse the page into a Document object

        File file = new File("D:\\纣临.txt");
        Elements h1Elements = doc.getElementsByTag("h1"); // query the DOM by tag name
        Element h1Element = h1Elements.get(0);
        String h1 = h1Element.text();
        System.out.println("题目:" + h1);
        all = all + "书名:" + h1 + "\r\n";

        Elements authorElements = doc.select("#info p"); // author line
        Element authorElement = authorElements.get(0);
        String author = authorElement.text();
        all = all + author + "\r\n";
        System.out.println(author);

        Element introElement = doc.getElementById("intro"); // synopsis
        String intro = introElement.text();
        System.out.println("简介" + intro);
        all = all + "简介" + intro + "\r\n";

        Elements hrefElements = doc.select("#list dl dd a"); // chapter links

        for (Element e : hrefElements) {
            String urlIndex = "http://www.biquge.com.tw" + e.attr("href");
            String contentIndex = HttpClientUtil.testHttpClient(urlIndex);
            Document docIndex = Jsoup.parse(contentIndex); // parse the chapter page

            // chapter title
            Elements chapterElements = docIndex.getElementsByTag("h1"); // query the DOM by tag name
            Element chapterElement = chapterElements.get(0);
            String chapter = chapterElement.text();

            // chapter body: collapse runs of whitespace into line breaks
            String textsIndex = docIndex.select("#content").text().replaceAll("\\s+", "\r\n");

            all = all + chapter + "\r\n" + textsIndex + "\r\n";
        }

        // write everything in one pass, appending to the file
        FileWriter fileWriter = new FileWriter(file, true);
        fileWriter.write(all);
        fileWriter.flush();
        fileWriter.close();
        System.out.println(all);
    }
}
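
One caveat about the final write: FileWriter always uses the JVM's default charset, so on a machine whose default is not GBK the Chinese text can come out garbled. Below is a minimal sketch of an explicit-charset alternative, reusing the all string from operateTxt; choosing GBK to match the site's encoding is my assumption, and UTF-8 works just as well if used consistently:

// Hedged sketch: write with an explicit charset instead of the platform default.
// Needs: java.io.FileOutputStream, java.io.OutputStreamWriter, java.io.Writer,
//        java.nio.charset.Charset
try (Writer writer = new OutputStreamWriter(
        new FileOutputStream("D:\\纣临.txt", true), Charset.forName("GBK"))) {
    writer.write(all); // try-with-resources flushes and closes automatically
}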


HttpClient utility class

package util;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

class HttpClientUtil {

    private static CloseableHttpClient httpClient;

    public static String testHttpClient(String url) {
        String content = "";
        CloseableHttpResponse response = null;
        try {
            // create an HttpClient with the default configuration
            httpClient = HttpClients.createDefault();

            // execute the GET request
            response = getResponse(url);

            // read the response entity; the site serves GBK-encoded HTML
            HttpEntity entity = response.getEntity();
            content = EntityUtils.toString(entity, "GBK");

            response.close();
            httpClient.close(); // release the client so connections are not leaked
        } catch (IOException e) {
            e.printStackTrace();
        }
        return content;
    }

    public static CloseableHttpResponse getResponse(String url) throws IOException {
        HttpGet httpGet = new HttpGet(url);
        return httpClient.execute(httpGet);
    }
}
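
Creating a fresh client for every request works, but nothing closes it if an exception is thrown mid-request. Below is a minimal try-with-resources sketch that guarantees cleanup; the method name fetchHtml is my own, not part of the original class, and it uses the same imports as HttpClientUtil:

// Hedged sketch: try-with-resources closes both the client and the response,
// even when an exception is thrown before the reads complete.
public static String fetchHtml(String url) throws IOException {
    try (CloseableHttpClient client = HttpClients.createDefault();
         CloseableHttpResponse response = client.execute(new HttpGet(url))) {
        return EntityUtils.toString(response.getEntity(), "GBK");
    }
}

For bulk crawling it is cheaper still to create one client, reuse it for every chapter request, and close it only after the whole book has been written.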


Main method entry point

import util.StoreTxt;
import java.io.IOException;

public class Main {
    public static void main(String[] args) {

        try {
            StoreTxt.operateTxt("http://www.biquge.com.tw/17_17380/");

        } catch (IOException e) {
            e.printStackTrace();
        }

    }
}

Problems encountered and solutions

Writing to the txt file: the second FileWriter argument opens the file in append mode, so new content is added to the end instead of overwriting what is already there:

fileWriter = new FileWriter(file, true);

Writing the txt in one pass: instead of opening a writer inside the chapter loop, the whole book is accumulated first and written once at the end:

        FileWriter fileWriter = new FileWriter(file, true);
        fileWriter.write(all);
        fileWriter.flush();
        fileWriter.close();
        System.out.println(all);

The accumulator for that single write is declared up front:

String all = ""; // collect everything first, then write it to the txt file in one go

A further improvement is to build the text with a StringBuffer (or StringBuilder): every all = all + ... allocates a brand-new string and recopies everything collected so far, which gets slower and slower as the book grows.
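
Below is a minimal sketch of that rewrite, reusing the variable names from operateTxt above; only the accumulation changes, everything else stays the same:

// Hedged sketch: StringBuilder appends in place instead of recopying on every +.
StringBuilder all = new StringBuilder();
all.append("书名:").append(h1).append("\r\n");
all.append(author).append("\r\n");
all.append("简介").append(intro).append("\r\n");

for (Element e : hrefElements) {
    // ... fetch and parse the chapter exactly as before ...
    all.append(chapter).append("\r\n").append(textsIndex).append("\r\n");
}

fileWriter.write(all.toString()); // convert to String once, at write time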



The open-source Java tools HttpClient and Jsoup make it easy to fetch web pages and extract structured data, which suits crawler development, data analysis, and similar scenarios. The steps and examples are as follows:

### Adding the dependency

Add the following dependency to the Maven project's `pom.xml`:

```xml
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
</dependency>
```

The above is the dependency configuration for Jsoup 1.8.3.

### Example: combining HttpClient and Jsoup

Below is a simple example that uses HttpClient to send a request and fetch the page content, then uses Jsoup to parse that content:

```java
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

public class HttpClientJsoupExample {
    public static void main(String[] args) {
        // create an HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // create the GET request
        HttpGet httpGet = new HttpGet("https://example.com"); // replace with the real URL
        try {
            // execute the request
            CloseableHttpResponse response = httpClient.execute(httpGet);
            // get the response entity
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // convert the entity content to a string
                String html = EntityUtils.toString(entity);
                // parse the HTML string with Jsoup
                Document doc = Jsoup.parse(html);
                // find elements with selector syntax
                Elements links = doc.select("a[href]");
                for (Element link : links) {
                    System.out.println("Link: " + link.attr("href"));
                    System.out.println("Text: " + link.text());
                }
            }
            // close the response
            response.close();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                // close the HttpClient
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
```

The code above sends a GET request with HttpClient to fetch the page, then uses Jsoup to parse the HTML and list every link element.

### Other usage

- **HttpClient**: GET requests (with or without parameters), POST requests (with or without parameters), connection-pool requests, request configuration, and so on. For example, a GET request without parameters:

```java
CloseableHttpClient httpClient = HttpClients.createDefault();
HttpGet httpGet = new HttpGet("https://example.com");
CloseableHttpResponse response = httpClient.execute(httpGet);
```

- **Jsoup**: can parse a URL, a string, or a file, traverse the document DOM-style, and find elements with selector syntax. For example, parsing straight from a URL:

```java
Document doc = Jsoup.connect("https://example.com").get();
```
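
One thing the dependency section above omits: the HttpClient examples also need the HttpClient artifact itself on the classpath. Below is a sketch of the corresponding coordinates; the 4.5.13 version number is my assumption, and any HttpClient 4.x release that matches the code should do:

```xml
<!-- assumed version: any HttpClient 4.x release works with the code above -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.13</version>
</dependency>
```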