JAVA 爬虫 之 北京信件 ---Jsoup

本文介绍了一个使用Java实现的爬虫程序,该程序利用Jsoup库抓取北京市政务网站(rexian.beijing.gov.cn)上的信件信息,并将每一页的内容保存到本地文件中(HttpClient虽然在依赖和import中引入,但代码中并未实际使用)。爬虫通过设置User-Agent、Cookie等请求头模拟浏览器行为,发送请求并解析返回的HTML文档。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

package com.spider;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
//import org.jsoup.Connection;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;

/**
 * Crawler for the letter listing of rexian.beijing.gov.cn.
 *
 * <p>For every listing page it follows each letter link, extracts the letter text and writes
 * all letters of that page into one text file under {@code spider/comment/}.
 *
 * @Author zhaoxin
 * @Email 1272743926@qq.com
 * @Date 2018/11/30
 **/
public class BJ {
    /** First listing page to fetch (inclusive). */
    private static final int FIRST_PAGE = 1;
    /** Last listing page to fetch (inclusive). */
    private static final int LAST_PAGE = 50;

    /** Prefix shared by the listing URL and the relative letter links. */
    private static final String BASE_URL = "http://rexian.beijing.gov.cn/default/";
    /** Directory (relative to the working directory) that receives one file per listing page. */
    private static final String OUTPUT_DIR = "spider/comment/";
    /** Explicit output encoding so the Chinese text does not depend on the platform default. */
    private static final String OUTPUT_CHARSET = "UTF-8";

    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36";
    // NOTE(review): hard-coded cookie string containing a JSESSIONID; presumably copied from a
    // browser session and likely to go stale - confirm whether the site still requires it.
    private static final String COOKIE = "pgv_pvi=4558768128; pgv_si=s3590505472; JSESSIONID=9DA2CC59CBF23789F93D028092A537A2; pgv_heid=1543563223863.1543567739768.1543567740806.45; accsid=1352; accside=left; accs1352=on; accfirst=0; __jsluid=de167ee5ed3b0e9e198288185efc2902; Hm_lvt_d7682ab43891c68a00de46e9ce5b76aa=1543563192; Hm_lpvt_d7682ab43891c68a00de46e9ce5b76aa=1543563192; _va_ref=%5B%22%22%2C%22%22%2C1543566649%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DGqfnHiLrNdisAae9_OHXPcmctOAkDREUM_djpN3fdtD76GxJXVb6fAkwP9vygdiw%26wd%3D%26eqid%3D85738a5e0000da7f000000045c00f147%22%5D; _va_ses=*; yunsuo_session_verify=b2a47ebec1c011e45e0e1b4f15c42d2b; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216763c43eb19f3-0a2f2c2fbe88b6-5701732-2073600-16763c43eb2363%22%7D; sensorsdata_is_new_user=true; _va_id=e30ca400ca2395fc.1543478402.3.1543567741.1543564410.";

    /**
     * Fetches listing pages {@code FIRST_PAGE..LAST_PAGE} and writes one output file per page.
     *
     * @param args ignored
     * @throws Exception if the output directory cannot be created, or a request or file write fails
     */
    public static void main(String[] args) throws Exception {
        // The original failed with FileNotFoundException when the directory was missing.
        File outputDir = new File(OUTPUT_DIR);
        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            throw new IOException("Cannot create output directory: " + outputDir.getAbsolutePath());
        }
        // No delay is applied between requests.
        for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
            savePage(page);
        }
    }

    /**
     * Downloads one listing page, follows every letter link on it and writes the letter texts
     * to {@code spider/comment/第<page>页内容.txt}, one line per letter.
     *
     * @param page listing page number sent as {@code PageCond/currentPage}
     * @throws IOException if a request fails or the file cannot be written
     */
    private static void savePage(int page) throws IOException {
        // URL of the listing page to parse
        String listUrl = BASE_URL + "com.web.index.moreNewLetterQuery.flow?PageCond/currentPage=" + page + "&type=nextPage";
        Document listing = conn(listUrl);
        Elements links = listing.select("div[class=my_activities_conter mail_conter] ul li p a");

        String fileName = OUTPUT_DIR + "第" + page + "页内容" + ".txt";
        BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(fileName), OUTPUT_CHARSET));
        // try/finally (not try-with-resources) keeps the code compilable at source level 6.
        try {
            System.out.println(fileName);
            for (int i = 0; i < links.size(); i++) {
                Document letter = conn(BASE_URL + links.get(i).attr("href"));
                Elements content = letter.select("div[class=con_left float_left] div[class=mail_track] span[class=font14 mail_problem]");
                writer.write("第" + i + "条内容:" + content.text() + "\n");
            }
        } finally {
            // Always release the file handle, even when a letter request fails mid-page.
            writer.close();
        }
    }

    /**
     * Requests {@code path} with browser-like headers and parses the response body.
     *
     * @param path absolute URL to fetch
     * @return the response body parsed into a jsoup DOM tree
     * @throws IOException if executing the request fails
     */
    public static Document conn(String path) throws IOException {
        Connection connection = Jsoup.connect(path);
        connection.header("User-Agent", USER_AGENT);
        connection.header("Cookie", COOKIE);
        connection.header("Connection", "keep-alive");
        connection.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        Connection.Response rs = connection.execute(); // obtain the response
        return Jsoup.parse(rs.body()); // convert to a DOM tree
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the com.spider:spider crawler project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.spider</groupId>
  <artifactId>spider</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>spider</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Single source of truth for the Java level; maven-compiler-plugin reads these by default. -->
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
    </dependency>
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.7.3</version>
    </dependency>
    <!-- Test API only: scoped to test so it is not packaged with the application. -->
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter-api</artifactId>
      <version>5.0.3</version>
      <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/net.sf.json-lib/json-lib -->
    <dependency>
      <groupId>net.sf.json-lib</groupId>
      <artifactId>json-lib</artifactId>
      <version>2.4</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.google.code.gson/gson -->
    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.8.0</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.53</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.44</version>
    </dependency>
    <!-- The former com.spider:spider:1.0-SNAPSHOT entry was removed: a project cannot depend on itself. -->
  </dependencies>
  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
        <!-- see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.7.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.20.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
    <plugins>
      <!-- No explicit source/target here: the previous hard-coded level 6 contradicted the
           maven.compiler.source/target properties (1.7) declared above, which now apply. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
      </plugin>
    </plugins>
  </build>
</project>

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值