Java爬虫的简易实现，爬取深度可控

最新推荐文章于 2025-02-08 17:38:12 发布

Erlnesa

最新推荐文章于 2025-02-08 17:38:12 发布

阅读量2.9k

点赞数 5

分类专栏： Java 文章标签： Java 爬虫

本文链接：https://blog.csdn.net/qq_43011305/article/details/85948356

版权

Java 专栏收录该内容

4 篇文章

订阅专栏

本文介绍了一种使用Java实现的简易爬虫，通过引用其他大佬的方法，可以控制爬取深度。提供了程序的功能说明、源码及下载地址，方便读者学习和使用。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

使用了大佬们的方法，原文链接

链接：https://blog.csdn.net/yes7849/article/details/78121112
链接：https://www.cnblogs.com/sanmubird/p/7857474.html

效果图

带有简单的UI界面

功能说明

指定入口Url和爬取深度
从指定的Url里开始解析网页源码，寻找网页链接放入Links类的待访问列表
获取Links的待访问列表，对每一个待访问Url分别新建一个线程去下载内部图片，将已访问的Url加入Links的已访问列表
清空Links的待访问列表
将本次已经访问过的Url进行解析，解析到的新Url放入Links的待访问列表
重复这个过程直到完成指定深度

源码

主类，程序入口，用来获取新的Url

package com.Pic_Download;

import java.awt.AWTException;
import java.awt.Robot;
import java.io.File;
import java.util.LinkedList;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Background task that downloads the images found on one page.
 *
 * <p>{@link #run()} stores this task's name in the static field
 * {@code Url_Pic_Download.filename} and then calls
 * {@code Url_Pic_Download.downloadimg(Url)}.
 *
 * <p>NOTE(review): {@code Url_Pic_Download.filename} is one static field shared by all
 * tasks, so two tasks that run {@link #run()} at the same moment can overwrite each
 * other's value. Passing the folder name as an argument instead of through the static
 * field would remove that hazard, but requires a matching change in
 * {@code Url_Pic_Download}.
 */
class Url_ implements Runnable {
	/** Thread backing this task; {@code null} until {@link #start()} is first called. */
	private Thread t;
	/** Name given to the worker thread and published as the image folder name. */
	private final String threadName;
	/** Address of the page whose images are downloaded. */
	private final String Url;

	/**
	 * @param name thread name, also published as {@code Url_Pic_Download.filename}
	 * @param Url  address of the page to process
	 */
	Url_( String name,String Url) {
		threadName = name;
		this.Url = Url;
	}

	/** Publishes the folder name, then downloads the page's images. */
	@Override
	public void run(){
		// No Url_Pic_Download instance is needed: both members used here are static.
		Url_Pic_Download.filename = threadName;
		Url_Pic_Download.downloadimg(Url);
	}

	/** Starts the worker thread; calls after the first one are ignored. */
	public void start () {
		if (t == null) {
			t = new Thread (this, threadName);
			t.start ();
		}
	}
}


/**
 * Console entry point of the crawler.
 *
 * <p>Reads an entry URL and a crawl depth from standard input, collects every
 * {@code .html} link found on the entry page, and then, once per depth level:
 * <ol>
 *   <li>starts one {@code Url_} download task per pending page (one second apart),</li>
 *   <li>empties the pending queue,</li>
 *   <li>parses the pages just processed to fill the queue for the next level.</li>
 * </ol>
 */
public class Get_Url {
	/** Pending / visited bookkeeping; the {@code Links} type is defined elsewhere. */
	private static final Links url_link = new Links();

	/** Entry page used when the user just presses Enter. */
	private static final String DEFAULT_ENTRY_URL = "http://www.mmonly.cc/mmtp/nymn/";
	/** Crawl depth used when the user just presses Enter or types a non-number. */
	private static final int DEFAULT_DEPTH = 1;
	/** Root folder that receives one sub-folder per crawled page. */
	private static final String PICTURE_ROOT = "D:/picture";
	/** Matches absolute links that end in {@code .html}. */
	private static final Pattern HTML_LINK = Pattern.compile("[a-zA-Z]+://[^\\s]*\\.+(html)");

	/**
	 * Runs the interactive crawl.
	 *
	 * @param args ignored
	 */
	public static void main(String args[]) {
		System.out.println();
		System.out.println();
		System.out.println("图片根目录(不存在会自动创建)D:/picture");

		Scanner scanner = new Scanner(System.in);
		Url_Download a = new Url_Download();

		// Whole lines are read so that an empty answer can select the advertised default;
		// Scanner.next() never returns an empty token.
		System.out.print("指定入口Url(默认为http://www.mmonly.cc/mmtp/nymn/):");
		String inURL = readLineOrDefault(scanner, DEFAULT_ENTRY_URL);

		System.out.print("设置访问深度(默认为1):");
		int Sd = parseDepth(readLineOrDefault(scanner, ""));

		File Gml = new File(PICTURE_ROOT);
		if (!Gml.exists()) {
			System.out.println("创建根文件夹D:/picture");
			Gml.mkdir();
		}

		// Seed the pending queue from the entry page.
		collectLinks(a, inURL);

		for (int sdkz = 1; sdkz <= Sd; sdkz++) {
			LinkedList<?> link = url_link.getUnVisitedUrlQueue();
			System.out.println("link元素数量:" + link.size());

			// Snapshot this level's pages before the queue is emptied below.
			LinkedList<String> current = new LinkedList<String>();
			for (Object pending : link) {
				current.add(pending.toString());
			}

			for (String pageUrl : current) {
				String folder = folderNameOf(pageUrl);
				File file = new File(PICTURE_ROOT, folder);
				if (!file.exists()) {
					file.mkdir();
				}

				url_link.addVisitedUrlSet(pageUrl);

				new Url_(folder, pageUrl).start();

				// One second between task starts to avoid flooding the target server.
				pause(1000);
			}

			// Drop every pending entry; the next level is rebuilt from `current`.
			while (!url_link.unVisitedUrlQueueIsEmpty()) {
				url_link.removeHeadOfUnVisitedUrlQueue();
			}

			if (sdkz != Sd) {
				// Parse the pages just processed to find the next level's links.
				for (String pageUrl : current) {
					collectLinks(a, pageUrl);
				}
				System.out.println("当前深度：" + sdkz + "-即将继续下一深度");
				pause(2000);
			} else {
				// Last level: nothing consumes further links, so the pages are not
				// fetched a second time.
				System.out.println("当前深度：" + sdkz + "-五秒后结束进程");
				pause(5000);
			}
		}
	}

	/** Returns the next input line trimmed, or {@code fallback} when it is blank or missing. */
	private static String readLineOrDefault(Scanner scanner, String fallback) {
		if (!scanner.hasNextLine()) {
			return fallback;
		}
		String line = scanner.nextLine().trim();
		return line.isEmpty() ? fallback : line;
	}

	/** Parses the depth answer; blank or non-numeric input yields {@link #DEFAULT_DEPTH}. */
	private static int parseDepth(String text) {
		if (text.isEmpty()) {
			return DEFAULT_DEPTH;
		}
		try {
			return Integer.parseInt(text);
		} catch (NumberFormatException e) {
			System.err.println("访问深度无效，使用默认值" + DEFAULT_DEPTH);
			return DEFAULT_DEPTH;
		}
	}

	/** Fetches {@code pageUrl} and queues every {@code .html} link found in its source. */
	private static void collectLinks(Url_Download fetcher, String pageUrl) {
		fetcher.setPageUrl(pageUrl);
		Matcher mt = HTML_LINK.matcher(fetcher.getPageSource());
		while (mt.find()) {
			url_link.addUnvisitedUrlQueue(mt.group());
		}
	}

	/** Returns the last path segment of {@code pageUrl} up to its first dot. */
	private static String folderNameOf(String pageUrl) {
		String[] segments = pageUrl.split("/");
		String[] parts = segments[segments.length - 1].split("\\.");
		return parts.length > 0 ? parts[0] : "";
	}

	/**
	 * Sleeps for {@code millis} milliseconds. Thread.sleep is used instead of
	 * java.awt.Robot#delay because Robot cannot be created in a headless JVM, which
	 * would silently skip the delay.
	 */
	private static void pause(long millis) {
		try {
			Thread.sleep(millis);
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			System.err.println("延时器错误");
		}
	}
}

工具类，用来下载指定图片到指定路径

package com.Pic_Download;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

import java.net.URL;

/**
 * Utility that copies the resource behind a URL into a local file.
 */
public class Pic_Download {

	/**
	 * Destination used when no target path is supplied. It lives under the same
	 * {@code D:/picture} root the rest of the program writes to.
	 */
	private static final String DEFAULT_TARGET = "D:/picture/1.jpg";

	/**
	 * Downloads {@code imageUrl} and writes it to {@code newImageName}.
	 *
	 * <p>A {@code null} or empty {@code imageUrl} is rejected with a console message.
	 * A {@code null} or empty {@code newImageName} falls back to {@link #DEFAULT_TARGET}.
	 * The target's parent directory is created when missing. I/O failures are reported
	 * on {@code System.err} and not rethrown.
	 *
	 * @param imageUrl     address of the resource to download
	 * @param newImageName path of the file to write
	 */
	public static void Pic_DownLoad(String imageUrl,String newImageName){
		// Null is tested before equals() so a null argument cannot throw.
		if (imageUrl == null || imageUrl.equals("")) {
			System.out.println("网络路径不能为空");
			return;
		}
		if (newImageName == null || newImageName.equals("")) {
			newImageName = DEFAULT_TARGET;
		}

		File target = new File(newImageName);
		File parent = target.getParentFile();
		if (parent != null && !parent.exists()) {
			parent.mkdirs();
		}

		long startime = System.currentTimeMillis();
		// try-with-resources closes both streams even when the copy fails midway.
		try (DataInputStream dis = new DataInputStream(new URL(imageUrl).openStream());
				FileOutputStream fos = new FileOutputStream(target)) {
			byte[] buffer = new byte[1024];
			int length;
			while ((length = dis.read(buffer)) != -1) {
				fos.write(buffer, 0, length);
			}
			fos.flush();
			long endtime = System.currentTimeMillis();
			System.out.println("耗时("+(endtime - startime)+"ms)下载完毕："+imageUrl);
		} catch (IOException e) {
			System.err.println("流错误：" + imageUrl + " (" + e + ")");
		}
	}
}

工具类，用来获取网页源码

package com.Pic_Download;


import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches the source of a web page, optionally reduced to plain text.
 *
 * <p>Configure the address with {@link #setPageUrl(String)} and, if the page is not
 * UTF-8, the charset with {@link #setPageEncode(String)} before fetching.
 *
 * @author winddack
 */
public class Url_Download {
  /** Matches a whole {@code <script>} element, case-insensitively. */
  private static final Pattern SCRIPT_BLOCK =
      Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);
  /** Matches a whole {@code <style>} element, case-insensitively. */
  private static final Pattern STYLE_BLOCK =
      Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);
  /** Matches any single HTML tag. */
  private static final Pattern HTML_TAG = Pattern.compile("<[^>]+>");
  /** Matches runs of whitespace (spaces, tabs, line breaks). */
  private static final Pattern WHITESPACE = Pattern.compile("\\s+");
  /** Ideographic full stop "。" (U+3002), written as an escape so it survives any source encoding. */
  private static final String SENTENCE_END = "\u3002";

  private String pageUrl; // address of the page to fetch
  private String pageEncode = "UTF8"; // charset name used to decode the page

  public String getPageUrl() {
    return pageUrl;
  }

  public void setPageUrl(String pageUrl) {
    this.pageUrl = pageUrl;
  }

  public String getPageEncode() {
    return pageEncode;
  }

  public void setPageEncode(String pageEncode) {
    this.pageEncode = pageEncode;
  }

  /**
   * Downloads the page at {@link #getPageUrl()} and decodes it with
   * {@link #getPageEncode()}.
   *
   * <p>The text is returned as received, line terminators included, so that tokens on
   * adjacent lines are not glued together. Any failure is printed to {@code System.err};
   * whatever was read before the failure (possibly nothing) is returned.
   *
   * @return the page source, or an empty string when nothing could be read
   */
  public String getPageSource()
  {
    StringBuilder sb = new StringBuilder();
    try {
      URL url = new URL(pageUrl);
      // Both the raw stream and the reader are closed even if decoding fails.
      try (java.io.InputStream raw = url.openStream();
          BufferedReader in = new BufferedReader(new InputStreamReader(raw, pageEncode))) {
        char[] chunk = new char[4096];
        int count;
        while ((count = in.read(chunk)) != -1) {
          sb.append(chunk, 0, count);
        }
      }
    }
    catch (Exception ex)
    {
      System.err.println(ex);
    }
    return sb.toString();
  }

  /**
   * Returns the page text with script blocks, style blocks, HTML tags and all
   * whitespace removed, cut after the first ideographic full stop.
   *
   * <p>When the text contains no ideographic full stop the result is the empty string.
   *
   * @return the leading sentence of the stripped page text, or an empty string
   */
  public String getPageSourceWithoutHtml()
  {
    String htmlStr = getPageSource();
    htmlStr = SCRIPT_BLOCK.matcher(htmlStr).replaceAll("");
    htmlStr = STYLE_BLOCK.matcher(htmlStr).replaceAll("");
    htmlStr = HTML_TAG.matcher(htmlStr).replaceAll("");
    htmlStr = WHITESPACE.matcher(htmlStr).replaceAll("");
    htmlStr = htmlStr.trim();
    // indexOf yields -1 when no full stop exists, which makes this substring(0, 0).
    return htmlStr.substring(0, htmlStr.indexOf(SENTENCE_END) + 1);
  }
}

工具类，解析网页并找出其中的图片链接

package com.Pic_Download;



import java.util.regex.Matcher;
import java.util.regex.Pattern;



/**
 * Background task that downloads one image into {@code D:/picture/<folder>/<name>.jpg}.
 */
class DownLoad implements Runnable {
	/** Thread backing this task; {@code null} until {@link #start()} is first called. */
	private Thread t;
	/** Thread name, also used as the image file's base name. */
	private final String threadName;
	/** Address of the image to download. */
	private final String Url;
	/** Sub-folder of {@code D:/picture} that receives the image. */
	private final String folder;

	/**
	 * Creates a task whose folder is the value of {@code Url_Pic_Download.filename}
	 * at construction time.
	 *
	 * @param name thread name and image base name
	 * @param Url  address of the image
	 */
	DownLoad( String name,String Url) {
		this(name, Url, Url_Pic_Download.filename);
	}

	/**
	 * Creates a task bound to an explicit folder.
	 *
	 * @param name   thread name and image base name
	 * @param Url    address of the image
	 * @param folder sub-folder of {@code D:/picture} that receives the image
	 */
	DownLoad(String name, String Url, String folder) {
		threadName = name;
		this.Url = Url;
		this.folder = folder;
	}

	/** Downloads the image to its target path. */
	@Override
	public void run(){
		// The folder was fixed at construction; reading the shared static field here
		// could pick up a value written meanwhile for a different page.
		Pic_Download.Pic_DownLoad(Url, "D:/picture/"+folder+"/"+threadName+".jpg");
	}

	/** Starts the worker thread; calls after the first one are ignored. */
	public void start () {
		if (t == null) {
			t = new Thread (this, threadName);
			t.start ();
		}
	}
}


/**
 * Scans a page for image links and starts one {@code DownLoad} task per link.
 */
public class Url_Pic_Download {
	/**
	 * Name of the sub-folder of {@code D:/picture} that receives the images of the next
	 * {@link #downloadimg(String)} call. Callers set it immediately before that call.
	 */
	public static String filename = "";

	/** Matches absolute links ending in one of the listed image extensions. */
	private static final Pattern IMAGE_LINK =
			Pattern.compile("[a-zA-Z]+://[^\\s]*\\.+(jpg|png|img|gif|jpeg)");

	/**
	 * Downloads every image linked from the given page into
	 * {@code D:/picture/<filename>/}, naming the files {@code 0.jpg}, {@code 1.jpg}, ...
	 * in order of appearance. A {@code null} or empty address is ignored.
	 *
	 * @param Url address of the page to scan
	 */
	public static void downloadimg(String Url){
		// Null is tested before equals() so a null argument cannot throw.
		if (Url == null || Url.equals("")) {
			return;
		}

		// Snapshot the folder before the (slow) page fetch: the static field may be
		// overwritten for another page while this one is still being processed.
		final String folder = filename;

		Url_Download a = new Url_Download();
		a.setPageUrl(Url);
		Matcher mt = IMAGE_LINK.matcher(a.getPageSource());

		int index = 0;
		while (mt.find()) {
			String url = mt.group();
			System.out.println(url);
			new DownLoad(String.valueOf(index), url, folder).start();
			index++;
		}
	}
}