Java爬虫爬取网页数据
一.简单介绍爬虫
网络爬虫(Web Crawler),又称为网络蜘蛛(Web Spider)或 Web 信息采集器,是一种按照一定规则,自动抓取或下载网络信息的计算机程序或自动化脚本,是目前搜索引擎的重要组成部分。
我的这个demo 是基于Jsoup做个简单实现java爬虫的
jsoup是一款Java的HTML解析器,主要用来对HTML解析 jsoup 中文官网
二.依赖
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
<!-- 文件下载 -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.5</version>
</dependency>
三.代码
package com.wxw.springboot;
import org.junit.Assert;
import org.junit.Test;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;
public class TestCatchImage {
/**
* 测试携带请求头
*/
@Test
public void testDownloadImageWithHeaders() {
String baiduLogoUrl = "xxxxx.png";
File localFile = new File("D:\\javaStudy\\springboot\\" + "scenery1.png");
Map<String, String> headers = new HashMap<String, String>();
// headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
// headers.put("Accept-Encoding", "gzip, deflate");
// headers.put("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8");
// headers.put("Cache-Control", "max-age=0");
// headers.put("Connection", "keep-alive");
// headers.put("Cookie", "referer=%22https%3A%5C%2F%5C%2Fwww.58pic.com%5C%2Ftupian%5C%2FBMP.html%22; qt_visitor_id=%228d800369135f0ea407efc30c35b855c0%22; qt_uid=0; qt_type=0; user-browser=%22baidu%22; history_search=%22%7B%5C%22BMP_%5C%22%3A%5C%22%5C%5C%5C%2Ftupian%5C%5C%5C%2FBMP.html%5C%22%7D%22; awake=0; qiantudata2018jssdkcross=%7B%22distinct_id%22%3A%2216956b4d4b859f-0f2dc330b57f7c-3a3a5c0e-2073600-16956b4d4b92bd%22%2C%22props%22%3A%7B%22latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22latest_referrer_host%22%3A%22www.baidu.com%22%2C%22latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; qt_utime=1551937913; FIRSTVISITED=1551937885.528; showAd:8d800369135f0ea407efc30c35b855c0=%22w6SIEgLKiJOIC5HVD3fKoJHKodaWmZy8mtm4zJbLytqWn5vMyZmWyZm4yJG4nwmWiIWIywr5zxj3AxnLCL2Pzci9iJeIlcj3DxjUiJOImsiSiNnOB6DFDgLTzxmIoJmSiMXHC6rFC5HVD423Aw4LiJOXntuXotm6ote3Fv3%3D%22");
// headers.put("Host", "pic.58pic.com");
// headers.put("If-Modified-Since", "Tue, 22 Jul 2014 15:40:16 GMT");
// headers.put("If-None-Match", "ae01753237194fa9a5badf2d90fd20d6");
// headers.put("Referer", "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%9B%BE%E7%89%87&step_word=&hs=0&pn=0&spn=0&di=107250&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=1986179278%2C1118313821&os=1011800134%2C3605107432&simid=3440756675%2C361207036&adpicid=0&lpn=0&ln=1765&fr=&fmq=1552355383195_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined©right=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fpic.58pic.com%2F58pic%2F15%2F68%2F59%2F71X58PICNjx_1024.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3Bcbrtv_z%26e3Bv54AzdH3Ffitstwg2p7AzdH3F8cmbcl08_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined");
headers.put("Referer", "https://www.huzhan.com/");
// headers.put("Referer", "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%9B%BE%E7%89%87&step_word=&hs=0&pn=0&spn=0&di=107250&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=1986179278%2C1118313821&os=1011800134%2C3605107432&simid=3440756675%2C361207036&adpicid=0&lpn=0&ln=1765&fr=&fmq=1552355383195_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined©right=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fpic.58pic.com%2F58pic%2F15%2F68%2F59%2F71X58PICNjx_1024.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3Bcbrtv_z%26e3Bv54AzdH3Ffitstwg2p7AzdH3F8cmbcl08_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined");
// headers.put("Referer", "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%9B%BE%E7%89%87&step_word=&hs=0&pn=0&spn=0&di=107250&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=1986179278%2C1118313821&os=1011800134%2C3605107432&simid=3440756675%2C361207036&adpicid=0&lpn=0&ln=1765&fr=&fmq=1552355383195_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined©right=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fpic.58pic.com%2F58pic%2F15%2F68%2F59%2F71X58PICNjx_1024.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3Bcbrtv_z%26e3Bv54AzdH3Ffitstwg2p7AzdH3F8cmbcl08_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined");
// headers.put("Referer", "http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E5%9B%BE%E7%89%87&step_word=&hs=0&pn=0&spn=0&di=107250&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=undefined&cs=1986179278%2C1118313821&os=1011800134%2C3605107432&simid=3440756675%2C361207036&adpicid=0&lpn=0&ln=1765&fr=&fmq=1552355383195_R&fm=&ic=undefined&s=undefined&hd=undefined&latest=undefined©right=undefined&se=&sme=&tab=0&width=undefined&height=undefined&face=undefined&ist=&jit=&cg=&bdtype=0&oriquery=&objurl=http%3A%2F%2Fpic.58pic.com%2F58pic%2F15%2F68%2F59%2F71X58PICNjx_1024.jpg&fromurl=ippr_z2C%24qAzdH3FAzdH3Fooo_z%26e3Bcbrtv_z%26e3Bv54AzdH3Ffitstwg2p7AzdH3F8cmbcl08_z%26e3Bip4s&gsm=0&rpstart=0&rpnum=0&islist=&querylist=&force=undefined");
// headers.put("Upgrade-Insecure-Requests", "1");
// headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");
Assert.assertTrue(downloadImageWithHeaders(baiduLogoUrl, "png", localFile, headers));
}
/**
* 携带头信息下载网络图片
* @param formatName 文件格式名称
* @param localFile 下载到本地文件
* @param headers http协议交互中header信息,如Cookie
* @return 下载是否成功
*/
public static boolean downloadImageWithHeaders(String imageUrl, String formatName, File localFile, Map<String, String> headers) {
boolean isSuccess = false;
InputStream stream = null;
try {
URL url = new URL(imageUrl);
URLConnection conn = url.openConnection();
if (headers != null && !headers.isEmpty()) {
//设置头信息
for (Map.Entry<String, String> entry : headers.entrySet()) {
conn.setRequestProperty(entry.getKey(), entry.getValue());
}
}
conn.setDoInput(true);
stream = conn.getInputStream();
BufferedImage bufferedImg = ImageIO.read(stream);
if (bufferedImg != null) {
isSuccess = ImageIO.write(bufferedImg, formatName, localFile);
} else {
throw new RuntimeException("图片[" + imageUrl + "]下载失败");
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (stream != null) {
try {
stream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return isSuccess;
}
}
注意爬取的图片为jpg 输入jpg
四.总结
要在有html元素有的情况下,才能通过jsoup来爬虫,如果是这接口获得的数据,那么通过jsoup是无法获取到的。
java爬虫在爬取网络的数据用到了jsoup中的方法,在下载css,img和创建html就用到了I/O流。