使用java做最简单的爬虫，爬取图片

最新推荐文章于 2025-07-11 11:53:51 发布

飞鸟无痕丶

最新推荐文章于 2025-07-11 11:53:51 发布

阅读量2.1k

点赞数 3

CC 4.0 BY-SA版权

分类专栏： java

本文链接：https://blog.youkuaiyun.com/qq_42773146/article/details/81178870

java 专栏收录该内容

25 篇文章

订阅专栏

本文介绍了一个简单的网页图片爬虫实现过程，包括使用Java和jQuery从指定网页抓取并解析图片资源，最后下载到本地的方法。

第一步：index.jsp页面输入要爬取的网址

第二步：目标result.jsp页面使用java（Downimg.java）和jquery爬取网址源码，并解析出所有img

Java 部分（Downimg.java）负责建立网络连接，并通过输入/输出流读取网页源码、下载图片：

package com.pc.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

/**
 * Minimal crawler helpers: read the text of a page and save a remote resource
 * (for example an image) to a local file.
 *
 * <p>Both helpers are best-effort: failures are reported on standard error via
 * {@code printStackTrace()} and signalled through the return value instead of
 * being thrown to the caller.
 */
public class Downimg {

    /** Number of bytes copied per read/write call while downloading. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Downloads the resource at {@code netImg} and writes it to the file {@code Path}.
     *
     * @param netImg URL of the resource to download
     * @param Path   path of the local file to create or overwrite
     * @return {@code true} when the whole resource was copied, {@code false} when
     *         any exception occurred (the stack trace is printed)
     */
    public static boolean download(String netImg,String Path){
        try {
            URLConnection urlConnection=new URL(netImg).openConnection();
            // try-with-resources closes both streams even when the copy fails
            // half-way; the output stream is closed first, then the input stream.
            try (InputStream inputStream=urlConnection.getInputStream();
                 FileOutputStream outputStream=new FileOutputStream(new File(Path))) {
                // Copy in chunks rather than one byte per call.
                byte[] buffer=new byte[BUFFER_SIZE];
                int read;
                while((read=inputStream.read(buffer))!=-1) {
                    outputStream.write(buffer,0,read);
                }
            }
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Reads the content behind {@code link} as text.
     *
     * <p>The content is read line by line and every line is terminated with
     * {@code '\n'} in the result, whatever line terminator the source used.
     *
     * @param link     URL of the page to read
     * @param encoding name of the charset used to decode the response bytes
     * @return the decoded text; if an exception occurs, whatever was read before
     *         the failure (possibly the empty string)
     */
    public static String htmlSource(String link,String encoding) {
        // Declared outside the try block so that partial content can still be
        // returned after a failure.
        StringBuilder source=new StringBuilder();
        try {
            URLConnection uConnection=new URL(link).openConnection();
            // Send an explicit User-Agent header with the request.
            uConnection.setRequestProperty("User-Agent", "java");
            // The raw stream is its own resource so it is closed even when the
            // reader cannot be created (e.g. unsupported encoding name).
            try (InputStream inputStream=uConnection.getInputStream();
                 BufferedReader reader=new BufferedReader(new InputStreamReader(inputStream,encoding))) {
                String line;
                while((line=reader.readLine())!=null) {
                    source.append(line).append('\n');
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return source.toString();
    }

    /** Manual smoke test: downloads one sample image to a fixed local path. */
    public static void main(String[] args) {
        System.out.println("Java里最牛逼的方法");
        download("http://pics.sc.chinaz.com/files/pic/pic9/201804/zzpic11491.jpg", "D:/VS/1.jpg");
    }
}

jsp中使用jquery解析img

<%@page import="com.pc.util.Downimg"%>
<%@ page language="java" contentType="text/html; charset=utf-8"
pageEncoding="utf-8"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Insert title here</title>
<script type="text/javascript" src="http://code.jquery.com/jquery-latest.js"></script>
</head>
<%
// Fetch the source of the page named by the "url" request parameter and expose
// it to the rest of this JSP as the page attribute "htmlSource".
String url=request.getParameter("url");
String htmlSource="";
// The URL comes straight from the request, so only http(s) targets are
// fetched; this keeps file:, jar: and similar schemes from being read on the
// server. A missing parameter yields an empty result.
if(url!=null&&(url.regionMatches(true,0,"http://",0,7)||url.regionMatches(true,0,"https://",0,8))){
   // NOTE(review): the charset is fixed to "gbk"; pages in other encodings
   // will be decoded incorrectly.
   htmlSource=Downimg.htmlSource(url, "gbk");
}
// The attribute is written via ${htmlSource} inside a <textarea>. Escape the
// markup-significant characters so a "</textarea>" in the fetched page cannot
// terminate the element early; the browser decodes the entities again, so the
// textarea value is the original source text.
htmlSource=htmlSource.replace("&","&amp;").replace("<","&lt;").replace(">","&gt;");
pageContext.setAttribute("htmlSource", htmlSource);
System.out.print(url);
%>
<body>
<%-- Shows the fetched page source; "overflow:auto" adds scrollbars when the text does not fit. --%>
<div>
<h1>你获取的数据</h1><br>
<textarea id="source" style="height:400px;width:1000px;overflow:auto;">${htmlSource}</textarea>
</div><br>
<form action="download.jsp" method="post" id="formf">
<label>符合条件的图片有：</label>

<input type="submit" value="下载图片">
</form>
<br>

第三步：创建下载页面 download.jsp，调用前面 Downimg.java 中的 download 方法，以流的方式把表单提交过来的各个 img 地址逐一下载到本地

<%@page import="com.pc.util.Downimg"%>
<%@page import="java.io.File"%>
<%@ page language="java" contentType="text/html; charset=utf-8"
pageEncoding="utf-8"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Insert title here</title>
<script type="text/javascript" src="http://code.jquery.com/jquery-latest.js"></script>
</head>

<body>
<%
// Download every image URL submitted in the "test" request parameter into a
// local folder and print an <img> tag for each file that was saved.
String[] imgs=request.getParameterValues("test");
if(imgs==null){
   // No "test" parameter was submitted: nothing to download.
   imgs=new String[0];
}
String imgpath="d:/Test/downPC";
System.out.println(imgpath);

File file=new File(imgpath);
// Create the target folder (and any missing parents) if it does not exist yet.
if(!file.exists()){
   file.mkdirs();
}
// Download each submitted image in turn.
for(String img:imgs){
   // The URLs come from the request; only absolute http(s) URLs are fetched.
   if(img==null||!(img.regionMatches(true,0,"http://",0,7)||img.regionMatches(true,0,"https://",0,8))){
      continue;
   }
   System.out.println(img);
   // The file name is the part after the last "/", without query string or fragment.
   String filename=img.substring(img.lastIndexOf("/")+1);
   int cut=filename.indexOf('?');
   if(cut>=0){
      filename=filename.substring(0,cut);
   }
   cut=filename.indexOf('#');
   if(cut>=0){
      filename=filename.substring(0,cut);
   }
   // Replace characters that are path separators, invalid in Windows file
   // names, or significant inside the HTML attribute written below.
   filename=filename.replaceAll("[\\\\/:*?\"<>|'&]","_");
   if(filename.isEmpty()||filename.equals(".")||filename.equals("..")){
      continue;
   }
   System.out.println(filename);
   // Only show the image when it was actually saved; the file name belongs
   // inside the quoted src attribute.
   if(Downimg.download(img,imgpath+"/"+filename)){
      out.print("<img src='down/"+filename+"'>");
   }
}
%>

</body>
</html>