Java simple crawler
Built on Java's URL class, it can crawl simple videos.
The crawler can fetch web pages and images, and uses jsoup as the underlying implementation.
It can serve as a basis for further development.
Environment: Java SE, Eclipse
Jar: jsoup
Maven:
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
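Before looking at the crawler class itself, here is a minimal jsoup sketch (the URL is a placeholder; only standard jsoup API calls are used) that fetches a page and lists the image links on it:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupQuickStart {
    public static void main(String[] args) throws Exception {
        // Fetch and parse a page (placeholder URL)
        Document doc = Jsoup.connect("https://example.com")
                .userAgent("Mozilla/5.0")
                .timeout(5000)
                .get();
        System.out.println(doc.title());
        // Print the absolute URL of every image on the page
        for (Element img : doc.select("img[src]")) {
            System.out.println(img.absUrl("src"));
        }
    }
}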
Crawler object:
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import study.core.Reflex;
/**
 * Crawler object.
 * @author Administrator
 * @since 0.0.1
 */
public class Reptile {
    /**
     * Request header object.
     * @since 0.0.2
     */
    class Header implements Reflex {
        /**
         * User-Agent string.
         * @since 0.0.2
         */
        private String userAgent;
        public String getUserAgent() {
            return userAgent;
        }
        public void setUserAgent(String userAgent) {
            this.userAgent = userAgent;
        }
    }
    /**
     * Current context (this crawler instance).
     */
    protected Reptile context = null;
    /**
     * Crawler User-Agent string.
     */
    private String userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36";
    /**
     * Connect timeout in milliseconds.
     */
    private Integer timeOut = 5000;
    /**
     * Read timeout in milliseconds.
     * @since 0.0.2
     */
    private Integer readTimeOut = 5000;
    /**
     * Request type, defaults to "get".
     * @since 0.0.2
     */
    private String requestType = "get";
    /**
     * Request header.
     * @since 0.0.2
     */
    private Header header = new Header();
    public Header getHeader() {
        return header;
    }
    public void setHeader(Header header) {
        this.header = header;
    }
    /**
     * Response status code.
     */
    private Integer code = 0;
    private Reptile() {
        context = this;
    }
    /**
     * Creates a new crawler instance.
     * @return a new Reptile
     */
    public static Reptile instance() {
        return new Reptile();
    }
    /**
     * Returns the response status code of the last request.
     * @return the HTTP status code
     */
    public Integer code() {
        return code;
    }
    /**
     * Sets the User-Agent string.
     * @param userAgent the User-Agent to send
     * @return this crawler
     */
    public Reptile userAgent(String userAgent) {
        this.userAgent = userAgent;
        return context;
    }
    /**
     * Sets the connect timeout.
     * @param timeOut timeout in milliseconds
     * @return this crawler
     */
    public Reptile timeOut(Integer timeOut) {
        this.timeOut = timeOut;
        return context;
    }
    /**
     * Sets the read timeout.
     * @param readTimeOut timeout in milliseconds
     * @return this crawler
     * @since 0.0.2
     */
    public Reptile readTimeOut(Integer readTimeOut) {
        this.readTimeOut = readTimeOut;
        return context;
    }
    /**
     * Sets the request type.
     * @param type the request type, e.g. "get" or "post"
     * @return this crawler
     * @since 0.0.2
     */
    public Reptile requestType(String type) {
        this.requestType = type;
        return context;
    }
    /**
     * Downloads the resource at the given URL.
     * @param url the URL to fetch
     * @param path the local path to save to
     * @param type null to return the raw input stream, "text" to save as text, any other value to save as a binary file
     * @param charset the character set used when saving text
     * @return an InputStream, a File, or null if the request failed
     * @throws Exception if the URL is missing or not an HTTP address
     */
    public Object download(final String url, final String path, final String type, String charset) throws Exception {
        // Validate that the URL is present and uses the HTTP protocol
        if (url == null || url.length() < 4) {
            throw new Exception("url is null or too short: " + url);
        }
        if (url.indexOf("http") == -1) {
            throw new Exception("url is not an http address: " + url);
        }
        Connection connect = Jsoup.connect(url);
        connect.userAgent(userAgent);
        Response response = connect.timeout(timeOut).ignoreContentType(true).execute();
        BufferedInputStream inputStream = response.bodyStream();
        code = response.statusCode();
        if (code == 200 && inputStream != null) {
            if (type == null) {
                return inputStream;
            } else if (type.equals("text")) {
                return Tool.save(path, getContentByInputStream(inputStream, charset), charset);
            } else {
                return Tool.save(path, inputStream);
            }
        }
        return null;
    }
    /**
     * Gets an input stream for the given URL.
     * @param url the URL to fetch
     * @return the response body as an InputStream
     * @throws Exception if the URL is invalid
     */
    public InputStream download(final String url) throws Exception {
        // A null type makes the core download method return the raw stream
        return (InputStream) context.download(url, null, null, null);
    }
    /**
     * Downloads text.
     * @param url the URL to fetch
     * @param path the local path to save to
     * @param charset the character set to use
     * @return the saved file
     * @throws Exception if the URL is invalid
     */
    public File download(final String url, final String path, String charset) throws Exception {
        return (File) context.download(url, path, "text", charset);
    }
    /**
     * Downloads a binary file.
     * @param url the URL to fetch
     * @param path the local path to save to
     * @return the saved file
     * @throws Exception if the URL is invalid
     */
    public File download(final String url, final String path) throws Exception {
        // Any non-null type other than "text" triggers a binary save
        return (File) context.download(url, path, "file", "");
    }
    /**
     * Reads an input stream into a string.
     * @param inputStream the input stream
     * @param charSet the character set
     * @return the stream content as a string
     * @throws IOException
     */
    public synchronized String getContentByInputStream(InputStream inputStream, String charSet) throws IOException {
        InputStreamReader inputStreamReader = new InputStreamReader(inputStream, charSet);
        char[] buff = new char[1024];
        int l = 0;
        StringBuffer sb = new StringBuffer();
        while ((l = inputStreamReader.read(buff)) != -1) {
            sb.append(buff, 0, l);
        }
        inputStreamReader.close();
        return sb.toString();
    }
    /**
     * Kernel: opens a raw HttpURLConnection with the configured timeouts.
     * @return the opened connection
     * @throws Exception if the URL is null
     * @since 0.0.2
     */
    @SuppressWarnings("static-access")
    public HttpURLConnection kernel(URL url) throws Exception {
        if (url == null) {
            throw new Exception("url is null: " + url);
        }
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setConnectTimeout(timeOut);
        connection.setReadTimeout(readTimeOut);
        // Assumed completion of the truncated source: apply the configured
        // request type and User-Agent, then hand back the connection
        connection.setRequestMethod(requestType.toUpperCase());
        connection.setRequestProperty("User-Agent", userAgent);
        return connection;
    }
}
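As a usage sketch (the URLs and local paths are placeholders, and the Tool.save helper used internally by Reptile is assumed to be on the classpath), the crawler can be driven through its fluent API like this:

import java.io.File;
import java.io.InputStream;

public class ReptileDemo {
    public static void main(String[] args) throws Exception {
        // Configure a crawler instance with the fluent API
        Reptile reptile = Reptile.instance()
                .userAgent("Mozilla/5.0")
                .timeOut(5000)
                .readTimeOut(5000);
        // Save a page as text
        File page = reptile.download("https://example.com", "D:/spider/page.html", "utf-8");
        System.out.println("Saved page: " + page + ", status: " + reptile.code());
        // Save a binary resource such as an image or a video
        File media = reptile.download("https://example.com/logo.png", "D:/spider/logo.png");
        System.out.println("Saved file: " + media + ", status: " + reptile.code());
        // Or work with the raw response stream directly
        InputStream in = reptile.download("https://example.com");
        in.close();
    }
}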