Java“简单科技”爬虫实现某博客访问量,30行代码搞定
java爬虫环境
- idea
- maven
- HTTPClient
Maven工程pom.xml必要坐标依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>xjit</groupId>
<artifactId>crawler</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.25</version>
<scope>test</scope>
</dependency>
</dependencies>
<properties>
<java.version>11</java.version>
<maven.compiler.source>${java.version}</maven.compiler.source>
<maven.compiler.target>${java.version}</maven.compiler.target>
</properties>
</project>
爬虫核心代码
数组中的地址为需要访问的文章的地址。程序为什么需要休眠60秒呢?因为同一IP地址在短时间内重复访问同一地址是无效的,所以需要休眠;经测试,60秒的休眠刚好满足要求。奈何技术有限,不会动态IP访问,只能通过休眠的方式解决,欢迎大佬用其它方案实现。
package test;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.List;
/**
 * Simple crawler that repeatedly issues GET requests against a fixed list of
 * blog-article URLs in order to drive up their view counters.
 *
 * <p>Per the accompanying article, repeated hits from the same IP within a
 * short window are not counted, hence the 60-second pause between full passes
 * over the URL list.
 */
public class Crawler {
    /** Number of full passes over the URL list. */
    private static final int ROUNDS = 100;
    /** Pause between two requests inside one pass, in milliseconds. */
    private static final long PER_REQUEST_PAUSE_MS = 1000L;
    /** Pause between two passes, in milliseconds (shorter intervals are not counted). */
    private static final long PER_ROUND_PAUSE_MS = 60L * 1000L;

    /**
     * Entry point: visits every article URL {@link #ROUNDS} times, printing the
     * response body of each successful (HTTP 200) request.
     *
     * @param args unused
     * @throws IOException          if a request fails at the transport level
     * @throws InterruptedException if the sleep between requests is interrupted
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        // Same URLs (and order) as the original index-by-index assignments;
        // note the first entry is intentionally duplicated in the source.
        String[] urls = {
                "https://blog.youkuaiyun.com/cm_mc_cm_mc/article/details/118722597?spm=1001.2014.3001.5502",
                "https://blog.youkuaiyun.com/cm_mc_cm_mc/article/details/118722597?spm=1001.2014.3001.5502",
                "https://blog.youkuaiyun.com/cm_mc_cm_mc/article/details/118812979?spm=1001.2014.3001.5502",
                "https://blog.youkuaiyun.com/cm_mc_cm_mc/article/details/118893364?spm=1001.2014.3001.5502",
                "https://blog.youkuaiyun.com/cm_mc_cm_mc/article/details/118941964?spm=1001.2014.3001.5502",
                "https://blog.youkuaiyun.com/cm_mc_cm_mc/article/details/118942544?spm=1001.2014.3001.5502",
                "https://blog.youkuaiyun.com/cm_mc_cm_mc/article/details/118972740?spm=1001.2014.3001.5502"
        };
        // try-with-resources closes the client (and its connection pool) on exit;
        // the original never closed it.
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            for (int round = 0; round < ROUNDS; round++) {
                for (String url : urls) {
                    HttpGet httpGet = new HttpGet(url);
                    // Close every response — the original leaked them, which
                    // exhausts the pooled connections of CloseableHttpClient.
                    try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
                        if (response.getStatusLine().getStatusCode() == 200) {
                            HttpEntity httpEntity = response.getEntity();
                            String content = EntityUtils.toString(httpEntity, "utf8");
                            System.out.println(content);
                        } else {
                            // Drain the body on non-200 so the connection can be reused.
                            EntityUtils.consume(response.getEntity());
                        }
                    }
                    Thread.sleep(PER_REQUEST_PAUSE_MS);
                }
                // Same-IP hits within ~60s are not counted; wait out the window.
                Thread.sleep(PER_ROUND_PAUSE_MS);
            }
        }
    }
}