Java simple crawler
Built on Java's URL class, it can crawl simple videos.
The crawler can fetch web pages and images, and uses jsoup as the underlying implementation.
It can serve as a basis for further development.
Environment: Java SE, Eclipse
Jar: jsoup
Maven:
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
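Before looking at the crawler class itself, here is a minimal jsoup sketch (the URL is a placeholder; only standard jsoup API calls are used) that fetches a page and lists the image links on it:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupQuickStart {
    public static void main(String[] args) throws Exception {
        // Fetch and parse a page (placeholder URL)
        Document doc = Jsoup.connect("https://example.com")
                .userAgent("Mozilla/5.0")
                .timeout(5000)
                .get();
        System.out.println(doc.title());
        // Print the absolute URL of every image on the page
        for (Element img : doc.select("img[src]")) {
            System.out.println(img.absUrl("src"));
        }
    }
}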
Crawler object:
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import study.core.Reflex;
/**
 * Crawler object.
 * @author Administrator
 * @since 0.0.1
 */
public class Reptile {
    /**
     * Request header object.
     * @since 0.0.2
     */
    class Header implements Reflex {
        /**
         * User-Agent string.
         * @since 0.0.2
         */
        private String userAgent;
        public String getUserAgent() {
            return userAgent;
        }
        public void setUserAgent(String userAgent) {
            this.userAgent = userAgent;
        }
    }
    /**
     * Current context (this crawler instance).
     */
    protected Reptile context = null;
    /**
     * Crawler User-Agent string.
     */
    private String userAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36";
    /**
     * Connect timeout in milliseconds.
     */
    private Integer timeOut = 5000;
    /**
     * Read timeout in milliseconds.
     * @since 0.0.2
     */
    private Integer readTimeOut = 5000;
    /**
     * Request type, defaults to "get".
     * @since 0.0.2
     */
    private String requestType = "get";
    /**
     * Request header.
     * @since 0.0.2
     */
    private Header header = new Header();
    public Header getHeader() {
        return header;
    }
    public void setHeader(Header header) {
        this.header = header;
    }
    /**
     * Response status code.
     */
    private Integer code = 0;
    private Reptile() {
        context = this;
    }
    /**
     * Creates a new crawler instance.
     * @return a new Reptile
     */
    public static Reptile instance() {
        return new Reptile();
    }
    /**
     * Returns the response status code of the last request.
     * @return the HTTP status code
     */
    public Integer code() {
        return code;
    }
    /**
     * Sets the User-Agent string.
     * @param userAgent the User-Agent to send
     * @return this crawler
     */
    public Reptile userAgent(String userAgent) {
        this.userAgent = userAgent;
        return context;
    }
    /**
     * Sets the connect timeout.
     * @param timeOut timeout in milliseconds
     * @return this crawler
     */
    public Reptile timeOut(Integer timeOut) {
        this.timeOut = timeOut;
        return context;
    }
    /**
     * Sets the read timeout.
     * @param readTimeOut timeout in milliseconds
     * @return this crawler
     * @since 0.0.2
     */
    public Reptile readTimeOut(Integer readTimeOut) {
        this.readTimeOut = readTimeOut;
        return context;
    }
    /**
     * Sets the request type.
     * @param type the request type, e.g. "get" or "post"
     * @return this crawler
     * @since 0.0.2
     */
    public Reptile requestType(String type) {
        this.requestType = type;
        return context;
    }
    /**
     * Downloads the resource at the given URL.
     * @param url the URL to fetch
     * @param path the local path to save to
     * @param type null to return the raw input stream, "text" to save as text, any other value to save as a binary file
     * @param charset the character set used when saving text
     * @return an InputStream, a File, or null if the request failed
     * @throws Exception if the URL is missing or not an HTTP address
     */
    public Object download(final String url, final String path, final String type, String charset) throws Exception {
        // Validate that the URL is present and uses the HTTP protocol
        if (url == null || url.length() < 4) {
            throw new Exception("url is null or too short: " + url);
        }
        if (url.indexOf("http") == -1) {
            throw new Exception("url is not an http address: " + url);
        }
        Connection connect = Jsoup.connect(url);
        connect.userAgent(userAgent);
        Response response = connect.timeout(timeOut).ignoreContentType(true).execute();
        BufferedInputStream inputStream = response.bodyStream();
        code = response.statusCode();
        if (code == 200 && inputStream != null) {
            if (type == null) {
                return inputStream;
            } else if (type.equals("text")) {
                return Tool.save(path, getContentByInputStream(inputStream, charset), charset);
            } else {
                return Tool.save(path, inputStream);
            }
        }
        return null;
    }
    /**
     * Gets an input stream for the given URL.
     * @param url the URL to fetch
     * @return the response body as an InputStream
     * @throws Exception if the URL is invalid
     */
    public InputStream download(final String url) throws Exception {
        // A null type makes the core download method return the raw stream
        return (InputStream) context.download(url, null, null, null);
    }
    /**
     * Downloads text.
     * @param url the URL to fetch
     * @param path the local path to save to
     * @param charset the character set to use
     * @return the saved file
     * @throws Exception if the URL is invalid
     */
    public File download(final String url, final String path, String charset) throws Exception {
        return (File) context.download(url, path, "text", charset);
    }
    /**
     * Downloads a binary file.
     * @param url the URL to fetch
     * @param path the local path to save to
     * @return the saved file
     * @throws Exception if the URL is invalid
     */
    public File download(final String url, final String path) throws Exception {
        // Any non-null type other than "text" triggers a binary save
        return (File) context.download(url, path, "file", "");
    }
    /**
     * Reads an input stream into a string.
     * @param inputStream the input stream
     * @param charSet the character set
     * @return the stream content as a string
     * @throws IOException
     */
    public synchronized String getContentByInputStream(InputStream inputStream, String charSet) throws IOException {
        InputStreamReader inputStreamReader = new InputStreamReader(inputStream, charSet);
        char[] buff = new char[1024];
        int l = 0;
        StringBuffer sb = new StringBuffer();
        while ((l = inputStreamReader.read(buff)) != -1) {
            sb.append(buff, 0, l);
        }
        inputStreamReader.close();
        return sb.toString();
    }
    /**
     * Kernel: opens a raw HttpURLConnection with the configured timeouts.
     * @return the opened connection
     * @throws Exception if the URL is null
     * @since 0.0.2
     */
    @SuppressWarnings("static-access")
    public HttpURLConnection kernel(URL url) throws Exception {
        if (url == null) {
            throw new Exception("url is null: " + url);
        }
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setConnectTimeout(timeOut);
        connection.setReadTimeout(readTimeOut);
        // Assumed completion of the truncated source: apply the configured
        // request type and User-Agent, then hand back the connection
        connection.setRequestMethod(requestType.toUpperCase());
        connection.setRequestProperty("User-Agent", userAgent);
        return connection;
    }
}
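As a usage sketch (the URLs and local paths are placeholders, and the Tool.save helper used internally by Reptile is assumed to be on the classpath), the crawler can be driven through its fluent API like this:

import java.io.File;
import java.io.InputStream;

public class ReptileDemo {
    public static void main(String[] args) throws Exception {
        // Configure a crawler instance with the fluent API
        Reptile reptile = Reptile.instance()
                .userAgent("Mozilla/5.0")
                .timeOut(5000)
                .readTimeOut(5000);
        // Save a page as text
        File page = reptile.download("https://example.com", "D:/spider/page.html", "utf-8");
        System.out.println("Saved page: " + page + ", status: " + reptile.code());
        // Save a binary resource such as an image or a video
        File media = reptile.download("https://example.com/logo.png", "D:/spider/logo.png");
        System.out.println("Saved file: " + media + ", status: " + reptile.code());
        // Or work with the raw response stream directly
        InputStream in = reptile.download("https://example.com");
        in.close();
    }
}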