A web crawler is a program or script that automatically fetches information from the World Wide Web according to a set of rules.
Getting started
1. First, create a Maven project and import the dependencies:
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
</dependency>
<dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.25</version>
</dependency>
Then create a log4j.properties file under resources:
# Send DEBUG-level log output to the console appender defined below
log4j.rootLogger=DEBUG,console
# Console appender settings
log4j.appender.console = org.apache.log4j.ConsoleAppender
log4j.appender.console.Target = System.out
log4j.appender.console.Threshold=DEBUG
log4j.appender.console.layout = org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern= [%c]-%m%n
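With slf4j-log4j12 on the classpath, HttpClient's own DEBUG output is routed to this console appender automatically, and your code can log through the same facade. A minimal sketch, assuming the configuration above (the class name LogDemo is just an illustration):

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LogDemo {
    private static final Logger logger = LoggerFactory.getLogger(LogDemo.class);

    public static void main(String[] args) {
        // Routed through slf4j-log4j12 to the console appender configured above
        logger.debug("crawler starting");
    }
}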
2. Create a class named FirstTest and write the following code:
package com.wh;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class FirstTest {
    public static void main(String[] args) throws IOException {
        // 1. "Open the browser": create an HttpClient instance
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 2. "Type the address": create an HttpGet request for the target URL
        HttpGet httpGet = new HttpGet("http://jiaoyu.xiangmu.com/");
        // 3. Send the request with the HttpClient
        CloseableHttpResponse response = httpClient.execute(httpGet);
        // 4. Parse the response and extract the page content
        if (response.getStatusLine().getStatusCode() == 200) {
            HttpEntity httpEntity = response.getEntity();
            String content = EntityUtils.toString(httpEntity, "utf8");
            System.out.println(content);
        }
        // Release the response and the client
        response.close();
        httpClient.close();
    }
}
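Both CloseableHttpClient and CloseableHttpResponse implement Closeable, so the manual close() calls above can also be replaced with try-with-resources. A minimal sketch of the same request (the class name is just an illustration):

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class FirstTestAutoClose {
    public static void main(String[] args) throws Exception {
        // Both resources are closed automatically, even if an exception is thrown
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(new HttpGet("http://jiaoyu.xiangmu.com/"))) {
            if (response.getStatusLine().getStatusCode() == 200) {
                System.out.println(EntityUtils.toString(response.getEntity(), "utf8"));
            }
        }
    }
}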
A crawler has three parts: collection, processing, and storage.
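As a rough illustration of those three stages in one program, here is a sketch; the regex-based title extraction and the output file name are placeholder assumptions, not part of HttpClient:

package com.wh;

import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class PipelineSketch {
    public static void main(String[] args) throws Exception {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            // 1. Collect: download the page
            String html = EntityUtils.toString(
                    client.execute(new HttpGet("http://jiaoyu.xiangmu.com/")).getEntity(), "utf8");
            // 2. Process: pull out the <title> text with a crude regex
            String title = html.replaceAll("(?s).*?<title>(.*?)</title>.*", "$1");
            // 3. Store: write the extracted data to a file
            Files.write(Paths.get("title.txt"), title.getBytes(StandardCharsets.UTF_8));
        }
    }
}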
HttpClient – GET
package com.wh;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;

public class HttpGetTest {
    public static void main(String[] args) throws Exception {
        // Create the HttpClient
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Build the request URI with URIBuilder
        URIBuilder uriBuilder = new URIBuilder("http://www.vixue.com/");
        // uriBuilder.setParameter("keys", "news"); // add a query parameter if needed
        uriBuilder.setPath("/news");
        // Create the HttpGet request with the assembled URI
        HttpGet httpGet = new HttpGet(uriBuilder.build());
        System.out.println("Request: " + httpGet);
        CloseableHttpResponse response = null;
        try {
            // Send the request and get the response
            response = httpClient.execute(httpGet);
            // Parse the response
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "utf8");
                System.out.println(content.length());
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the response (guard against a failed request leaving it null)
            if (response != null) {
                try {
                    response.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Close the client
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
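The commented-out setParameter call above is how query parameters are attached to the URL. A small standalone sketch of what URIBuilder assembles (the parameter values are placeholders):

import org.apache.http.client.utils.URIBuilder;

public class UriBuilderDemo {
    public static void main(String[] args) throws Exception {
        URIBuilder builder = new URIBuilder("http://www.vixue.com/");
        builder.setPath("/news");             // replaces the path component
        builder.setParameter("keys", "news"); // appends ?keys=news as a query string
        // Prints: http://www.vixue.com/news?keys=news
        System.out.println(builder.build());
    }
}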