PaChong（不要被封）_pachong moban-优快云博客

本文链接：https://blog.youkuaiyun.com/weixin_52528062/article/details/127646487

本文介绍了一个具体的爬虫应用案例，包括如何使用Java进行网页抓取、解析HTML源码以提取图片链接，并下载指定图片到本地的过程。同时涉及了HTTP协议的基础使用及正则表达式的图片链接匹配。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1.Socket，多线程，
2.继承，super，this
this.属性
this.方法
this()：调用本类构造方法，但是只能在构造方法中使用
super.属性名
super.方法名
super()：调用父类构造方法，但是只能写在第一行
3.多态
1.父类的引用类型变量指向了类的实例化对象，类的多态，继承的多态
2.父类的引用类型变量指向了接口的实现类对象，接口的多态，实现的多态
对一个引用有多种表现形式，借由继承和实现来扩展功能。
A a = new A ()
A extends/implements B
B b = new A ()

开头保命

package pachong2;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
import java.net.URLConnection;
import java.net.UnknownHostException;
import java.util.List;

import com.util.ImgUtil;

/*
 * 爬虫是客户端
 * 1.先发送消息
 * 2.再接收消息
 */

/*
 * 端口：http----80
 * 		https----443
 */
/**
 * Minimal image crawler: fetches one HTML page over HTTP, extracts the
 * {@code src} of every {@code <img>} tag via {@link ImgUtil#getImgSrc(String)},
 * and downloads every image whose address contains {@code "UploadFiles"}
 * into the local directory {@link #path}.
 */
public class ImageSorce {

	/** Local directory the downloaded images are written to. */
	static final String path = "C:\\img";

	/** Connect/read timeout in milliseconds, matching the value used by ImgUtil.downloadImg. */
	private static final int TIMEOUT_MILLIS = 10000;

	/**
	 * Entry point: crawls the start page and saves the matching images.
	 *
	 * @param args unused
	 * @throws IOException if the page or any image cannot be fetched, or a file cannot be written
	 */
	public static void main(String[] args) throws IOException {

		// Open the connection and read the whole page.
		URL url1 = new URL("http://www.cssmoban.com/");
		String respone = fetchPage(url1);

		System.out.println("数据接收完毕");

		// Extract every image address from the page with a regular expression.
		List<String> list = ImgUtil.getImgSrc(respone);

		// createNewFile/FileOutputStream fail if the parent directory is missing,
		// so make sure it exists before the first download.
		File dir = new File(path);
		if (!dir.isDirectory() && !dir.mkdirs()) {
			throw new IOException("Cannot create directory " + dir);
		}

		// Only addresses containing "UploadFiles" are images we want to crawl.
		for (String src_path : list) {
			if (src_path == null || src_path.equals("") || !src_path.contains("UploadFiles")) {
				continue;
			}

			// File name = everything after the last slash. The +1 drops the slash
			// itself and also copes with lastIndexOf returning -1.
			String src_name = src_path.substring(src_path.lastIndexOf("/") + 1);
			if (src_name.isEmpty()) {
				// Address ends with '/': there is no file name to save under.
				continue;
			}
			System.out.println("开始爬取" + src_name);

			// Resolve against the page URL so that relative src values
			// (e.g. "/UploadFiles/a.jpg") work as well as absolute ones.
			downloadImage(new URL(url1, src_path), new File(dir, src_name));

			System.out.println(src_name + "寫入成功");
		}

	}

	/**
	 * Downloads the body of {@code pageUrl} with a GET request and returns it as one string.
	 * Lines are concatenated without separators, so a tag that was split across
	 * several lines in the page becomes a single run of text for the regex matcher.
	 *
	 * @param pageUrl address of the page to fetch
	 * @return the page content decoded as UTF-8, with line terminators removed
	 * @throws IOException if the connection or the read fails
	 */
	private static String fetchPage(URL pageUrl) throws IOException {
		HttpURLConnection httpUrlConn = (HttpURLConnection) pageUrl.openConnection();
		httpUrlConn.setDoInput(true);
		httpUrlConn.setRequestMethod("GET");
		httpUrlConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)");
		httpUrlConn.setConnectTimeout(TIMEOUT_MILLIS);
		httpUrlConn.setReadTimeout(TIMEOUT_MILLIS);

		StringBuilder sb = new StringBuilder();
		// Byte stream -> character stream -> buffered reader; closed automatically.
		try (BufferedReader br = new BufferedReader(
				new InputStreamReader(httpUrlConn.getInputStream(), "utf-8"))) {
			String data;
			// Append each line BEFORE reading the next one: reading first would
			// drop the first line and append the terminating null as "null".
			while ((data = br.readLine()) != null) {
				sb.append(data);
			}
		} finally {
			httpUrlConn.disconnect();
		}
		return sb.toString();
	}

	/**
	 * Copies the resource at {@code imageUrl} into {@code target}, replacing any existing file.
	 *
	 * @param imageUrl absolute address of the image
	 * @param target   local file to write
	 * @throws IOException if the download or the write fails
	 */
	private static void downloadImage(URL imageUrl, File target) throws IOException {
		URLConnection conn = imageUrl.openConnection();
		conn.setConnectTimeout(TIMEOUT_MILLIS);
		conn.setReadTimeout(TIMEOUT_MILLIS);

		// try-with-resources closes both streams (output first, which flushes the
		// buffer) even when the copy fails part-way.
		try (InputStream bis = new BufferedInputStream(conn.getInputStream());
				OutputStream bos = new BufferedOutputStream(new FileOutputStream(target))) {
			byte[] img_byte = new byte[1024];// chunk buffer for the image bytes
			int img_len;// number of bytes read in the current chunk
			while ((img_len = bis.read(img_byte)) != -1) {
				bos.write(img_byte, 0, img_len);
			}
		}
	}

}

发pachong文章竟然会被封，伪装一下。

package com.util;

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Helpers for extracting image addresses from HTML text and downloading images.
 */
public class ImgUtil {

	/**
	 * Matches one {@code <img ...>} tag in any of the three usual closings
	 * ({@code />}, {@code ></img>}, {@code >}); group 2 is the attribute text.
	 * DOTALL lets a tag span several lines.
	 */
	private static final Pattern IMG_TAG =
			Pattern.compile("<(img|IMG)(.*?)(/>|></img>|>)", Pattern.DOTALL);

	/**
	 * Matches a {@code src} attribute; group 3 is the value. The back-reference
	 * {@code \2} forces the closing quote to be the same character as the opening
	 * one, so a value such as {@code "it's.jpg"} is not cut at the apostrophe.
	 */
	private static final Pattern SRC_ATTR = Pattern.compile("(src|SRC)=(\"|')(.*?)\\2");

	/**
	 * Extracts the {@code src} value of every {@code <img>} tag found in {@code content}.
	 *
	 * @param content HTML text to scan; {@code null} is treated as empty
	 * @return the {@code src} values in document order; tags without a quoted
	 *         {@code src} contribute nothing. Never {@code null}.
	 */
	public static List<String> getImgSrc(String content) {
		List<String> list = new ArrayList<String>();
		if (content == null) {
			return list;
		}
		Matcher m_img = IMG_TAG.matcher(content);
		while (m_img.find()) {
			// Look for the src attribute inside this tag's attribute text only.
			Matcher m_src = SRC_ATTR.matcher(m_img.group(2));
			if (m_src.find()) {
				list.add(m_src.group(3));
			}
		}
		return list;
	}

	/**
	 * Downloads the image at {@code url} into {@code C:/pipeline/moban/} under a
	 * random UUID-based file name that keeps the original extension.
	 *
	 * @param url absolute address of the image
	 * @throws Exception if the URL is malformed or the download/write fails
	 */
	private static void downloadImg(String url) throws Exception {
		String path = "C:/pipeline/moban/";
		File dir = new File(path);
		// Create the target directory if it does not exist yet.
		if (!dir.exists()) {
			dir.mkdirs();
		}
		// Take the extension from the URL's path component only, so a query
		// string or a dot in the host name cannot end up in the file name.
		URL source = new URL(url);
		String realExt = extensionOf(source.getPath());
		String fileName = UUID.randomUUID().toString().replace("-", "") + realExt;
		String filePath = path + fileName;
		File img = new File(filePath);
		if (img.exists()) {
			// A file with this name is already present: skip the download.
			System.out.println(String.format("No.<%s>", fileName));
			return;
		}

		URLConnection con = source.openConnection();
		con.setConnectTimeout(10000);
		con.setReadTimeout(10000);

		// try-with-resources closes both streams even if the copy fails.
		try (InputStream is = con.getInputStream();
				FileOutputStream os = new FileOutputStream(img)) {
			byte[] bs = new byte[2048];
			int len;
			while ((len = is.read(bs)) != -1) {
				os.write(bs, 0, len);
			}
		}
	}

	/**
	 * Returns the extension (including the leading dot) of the last segment of
	 * {@code urlPath}, or an empty string when that segment has no dot.
	 */
	private static String extensionOf(String urlPath) {
		int slash = urlPath.lastIndexOf('/');
		int dot = urlPath.lastIndexOf('.');
		return dot > slash ? urlPath.substring(dot) : "";
	}
}