java批量获取百度图片

原创于 2022-03-08 09:15:26 发布 · 3.1k 阅读

CC 4.0 BY-SA版权

文章标签：

本文介绍如何使用Java实现百度图片的爬取。通过Apache HttpClient库，设置特定URL参数如queryWord和pn，实现分页抓取图片链接。然后解析JSON响应，提取图片URL，并下载到本地。代码中包含了错误处理和超时配置，适用于学习网络爬虫和图片下载。

java获取百度图片学习记录

主要用到的jar包

        <dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
		</dependency>

分析百度图片的加载 URL：每次请求加载 30 张图片，分页主要由参数 pn 控制，gsm 为 pn 的十六进制表示，URL 末尾是时间戳，搜索内容由 queryWord 控制。

https://image.baidu.com/search/acjson?tn=resultjson_com&logid=7312553249612451609&ipn=rj&ct=201326592&is=&fp=result&fr=&word=%E7%BE%8E%E6%99%AF&queryWord=%E7%BE%8E%E6%99%AF&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn=30&rn=30&gsm=1e&1646700092850=

可以写一个实体类用来设置url

/**
 * Builds the Baidu image-search {@code acjson} request URL for one page of results.
 *
 * <p>The search keyword is URL-encoded once in the constructor; the page offset
 * ({@code pn}) and its hexadecimal twin ({@code gsm}) are always changed together.
 */
public class PageUrl {

	/** Search keyword, already URL-encoded as UTF-8. */
	private String queryWord;
	/** Result offset sent as the {@code pn} query parameter. */
	private int pn;
	/** Hexadecimal representation of {@link #pn}, sent as the {@code gsm} query parameter. */
	private String gsm;

	/**
	 * @param queryWord raw (un-encoded) search keyword
	 * @param pn        initial result offset
	 * @throws IllegalStateException if the JVM reports UTF-8 as unsupported
	 */
	public PageUrl(String queryWord, int pn) {
		// URL-encode the keyword so it can be embedded in the query string.
		try {
			this.queryWord = URLEncoder.encode(queryWord, "UTF-8");
		} catch (UnsupportedEncodingException e) {
			// Fail loudly instead of leaving queryWord null, which would silently
			// put the literal text "null" into every generated URL.
			throw new IllegalStateException("UTF-8 encoding is not supported", e);
		}
		this.pn = pn;
		this.gsm = Integer.toHexString(pn);
	}

	/**
	 * Moves to another result offset; {@code gsm} is recomputed so the two stay in sync.
	 *
	 * @param pn new result offset
	 */
	public void setPn(int pn) {
		this.pn = pn;
		this.gsm = Integer.toHexString(pn);
	}

	/**
	 * Returns the complete request URL for the current offset. The trailing
	 * parameter is the current time in milliseconds, so each call yields a fresh URL.
	 */
	@Override
	public String toString() {
		// The '?' after "acjson" starts the query string; without it the whole
		// parameter list becomes part of the path.
		return "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord="
				+ queryWord + "&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&word="
				+ queryWord + "&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn="
				+ pn + "&rn=30&gsm=" + gsm + "&" + new Date().getTime() + "=";
	}
}

编写实现类

public class PicSpider {

	public void crawlePicture(String queryWord, int page) throws Exception {
		if (page < 1) {
			throw new Exception("page set error.");
		}
		PageUrl pageUrl = new PageUrl(queryWord, 30); /
		for (int i = 1; i <= page; i++) {
			pageUrl.setPn(i * 30);
			getJson(pageUrl.toString());
		}
	}
	public String getJson(String url) {
		String json = null;
		try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
			HttpGet getMethod = new HttpGet(url);
			setHeaders(getMethod);
			try (CloseableHttpResponse response = httpClient.execute(getMethod)) {
				int statusCode = response.getStatusLine().getStatusCode();
				if (statusCode == HttpStatus.SC_OK) {
					HttpEntity entity = response.getEntity();
					if (entity != null) {
						json = EntityUtils.toString(entity, "UTF-8");
						resolveJson(json);
					}
				} else {
					throw new IOException("请求失败：" + statusCode);
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return json;
	}

	public List<String> resolveJson(String json) {
		// 使用正则表达式，进行匹配，获取 objURL
		String regx = "\"thumbURL\":\"(.*?)\",";
		Pattern p = Pattern.compile(regx);
		Matcher m = p.matcher(json);
		List<String> strs = new LinkedList<>();
		while (m.find()) {
			strs.add(m.group(0));
		}
		// 使用 Stream API 进行处理并返回。
		return strs.stream().map(s -> s.substring(12, s.length() - 2)).collect(Collectors.toList());
	}
	public void download(List<String> urlList) {
		// 用于统计一些数据
		AtomicInteger successCount = new AtomicInteger(0), failCount = new AtomicInteger(0),
				exceptionCount = new AtomicInteger(0);
		// 设置超时时间
		RequestConfig config = RequestConfig.custom().setSocketTimeout(10 * 1000).setConnectTimeout(10 * 1000)
				.setConnectionRequestTimeout(10 * 1000).setRedirectsEnabled(false) // 不允许自动重定向，否则会把html页面当成图片下载下来
				.build();
		try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
			urlList.forEach(url -> {
				HttpGet getMethod = new HttpGet(url);
				try (CloseableHttpResponse response = httpClient.execute(getMethod)) {
					int statusCode = response.getStatusLine().getStatusCode();
					if (statusCode == HttpStatus.SC_OK) {
						HttpEntity entity = response.getEntity();
						if (entity != null) {
							String filename = this.getFileName(url);
							File file = new File("F:/baiduImage/" + filename);
							if (!file.exists()) {
								file.getParentFile().mkdirs();
							}
							try (OutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
								entity.writeTo(out);
								successCount.getAndIncrement();
								System.out.println(statusCode + " success: " + url + "\n" + filename);
							}
						}
					} else {
						failCount.getAndIncrement();
						System.out.println(statusCode + " fail: " + url);
					}
				} catch (IOException e) {
					e.printStackTrace();
					exceptionCount.getAndIncrement();
					System.out.println("IOException: " + url);
				}
			});
		} catch (IOException e1) {
			e1.printStackTrace();
		}
		System.out.println("statistic data[ " + "Success: " + successCount.get() + "\n" + "Fail: " + failCount.get()
				+ "\n" + "Exception: " + exceptionCount.get() + " ]");
	}

	private String getFileName(String url) {
		String suffix = url.substring(url.lastIndexOf("/") + 1);
		if (suffix.contains("?")) {
			suffix = suffix.split("[?]")[0]; // 这个 ? ，不能直接使用，必须转义一下
		}
		// 后缀默认就是 jpeg
		suffix = -1 != suffix.lastIndexOf(".") ? suffix.substring(suffix.lastIndexOf(".")) : ".jpeg";
		return UUID.randomUUID().toString() + suffix;
	}
	public static void setHeaders(HttpGet get) {
		get.setHeader("Accept",				"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9");
		get.setHeader("Accept-Encoding", "gzip, deflate, br");
		get.setHeader("Cache-Control", "max-age=0");
		get.setHeader("Connection", "keep-alive");
		get.setHeader("Cookie","自己登录的cookie信息");// **自己登录的cookie信息**
		get.setHeader("Host", "image.baidu.com");
		get.setHeader("Upgrade-Insecure-Requests", "1");
		get.setHeader("User-Agent",
				"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36");
	}
}

编写main方法获取图片

/**
 * Demo entry point: collects thumbnail URLs for the keyword over 5 result
 * pages and downloads them.
 */
public static void main(String[] args) throws Exception {
		PicSpider spider = new PicSpider();
		List<String> urls = spider.crawlePicture("高清美景", 5);
		// download is an instance method, so it has to be invoked on the spider.
		spider.download(urls);
	}

运行后，可以在 F:/baiduImage/ 文件夹中找到下载的图片。
仅供学习使用