Java 抓取网页图片

import  java.io.ByteArrayOutputStream;
import  java.io.File;
import  java.io.FileOutputStream;
import  java.io.IOException;
import  java.io.InputStream;
import  java.io.OutputStream;
import  java.net.HttpURLConnection;
import  java.net.URL;
import  java.net.URLConnection;
import  java.text.SimpleDateFormat;
import  java.util.ArrayList;
import  java.util.Iterator;
import  java.util.LinkedHashSet;
import  java.util.List;
import  java.util.UUID;
import  java.util.regex.Matcher;
import  java.util.regex.Pattern;
 
/**
 * Scrapes a web page for images and saves them to a local directory.
 *
 * <p>The page is fetched as text, {@code <img>} tags are located with regular
 * expressions, the absolute {@code http(s)} URLs inside them are extracted, and
 * every URL is downloaded either sequentially ({@link #Download}) or in batches
 * on worker threads ({@link #ThreadDownload}, {@link #TOThreadDownload}).
 *
 * @author ITWANG
 */
public class CatchImage
{

    // Page scraped when no URL is passed on the command line.
    private static final String PAGE_URL = "http://www.4493.com/";
    // Directory images are saved under when none is passed on the command line.
    private static final String SAVE_ROOT = "E:\\Imagesave";
    // Charset used to decode the downloaded page.
    private static final String ENCODING = "UTF-8";
    // Size of the buffer used when copying streams.
    private static final int BUFFER_SIZE = 1024;

    // One complete <img ...> tag that carries a src attribute. [^>] keeps the match
    // inside a single tag, so several tags on one line are reported separately.
    private static final Pattern IMG_TAG =
            Pattern.compile("<img\\b[^>]*?\\bsrc\\s*=[^>]*>", Pattern.CASE_INSENSITIVE);
    // An absolute http/https URL, ending at the first quote, '>' or whitespace.
    private static final Pattern IMG_SRC = Pattern.compile("https?://[^\"'>\\s]+");
    // A quoted run of text that starts with "http" and ends with an image suffix.
    private static final Pattern EMBEDDED_IMG =
            Pattern.compile("[\"\'](http.+\\.(jpg|JPG|png|PNG|gif|GIF))[\"\']");
    // File suffixes recognised by getpicsrc.
    private static final String[] PICTURE_SUFFIXES = { "jpg", "JPG", "gif", "GIF", "png", "PNG" };

    // URLs collected by getpicsrc; shared by all instances.
    private static final List<String> pList = new ArrayList<>();

    /**
     * Fetches a page, extracts its image URLs and downloads them one by one into
     * a fresh time-stamped sub-directory.
     *
     * @param args optional: {@code args[0]} is the page URL, {@code args[1]} the
     *             directory to save under; the built-in defaults are used for
     *             whichever is missing
     * @throws Exception if the page cannot be fetched or decoded
     */
    public static void main(String[] args) throws Exception
    {
        String pageUrl = args.length > 0 ? args[0] : PAGE_URL;
        String saveRoot = args.length > 1 ? args[1] : SAVE_ROOT;

        CatchImage cm = new CatchImage();
        String html = cm.getHTML(pageUrl);
        List<String> imgTags = cm.getImageUrl(html);
        // Drop duplicates so the same image is not fetched (and overwritten) twice.
        List<String> imgSrc = RemoveRepeated(cm.getImageSrc(imgTags));
        cm.Download(imgSrc, saveRoot + saveDiff());

        // Alternatives: cm.getImageSrc(html) fills pList instead, and
        // ThreadDownload / TOThreadDownload download in batches on worker threads.
    }

    /**
     * Downloads a page and decodes it with {@link #ENCODING}.
     *
     * <p>All bytes are collected before decoding so that a multi-byte character
     * split across two reads is still decoded correctly.
     *
     * @param url address of the page
     * @return the page content as text
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    private String getHTML(String url) throws Exception
    {
        URLConnection connection = new URL(url).openConnection();
        try (InputStream in = connection.getInputStream();
             ByteArrayOutputStream page = new ByteArrayOutputStream())
        {
            copy(in, page);
            return page.toString(ENCODING);
        }
    }

    /**
     * Finds every {@code <img>} tag with a {@code src} attribute.
     *
     * @param HTML page content
     * @return the matched tags, in document order
     */
    private List<String> getImageUrl(String HTML)
    {
        List<String> listImgUrl = new ArrayList<String>();
        Matcher matcher = IMG_TAG.matcher(HTML);
        while (matcher.find())
        {
            listImgUrl.add(matcher.group());
        }
        return listImgUrl;
    }

    /**
     * Extracts the absolute http/https URLs found inside the given tags.
     *
     * @param listImageUrl tags as returned by {@link #getImageUrl(String)}
     * @return the URLs without surrounding quotes or whitespace
     */
    private List<String> getImageSrc(List<String> listImageUrl)
    {
        List<String> listImgSrc = new ArrayList<String>();
        for (String image : listImageUrl)
        {
            Matcher matcher = IMG_SRC.matcher(image);
            while (matcher.find())
            {
                listImgSrc.add(matcher.group());
            }
        }
        return listImgSrc;
    }

    /**
     * Scans raw HTML for quoted image URLs and records them through
     * {@link #getpicsrc(String)}.
     *
     * @param html page content
     */
    private void getImageSrc(String html)
    {
        Matcher m = EMBEDDED_IMG.matcher(html);
        while (m.find())
        {
            getpicsrc(m.group(1));
        }
    }

    /**
     * Splits a string that may hold several concatenated {@code http:} URLs and
     * appends each image URL found to the shared result list.
     *
     * <p>Every segment is cut right after the first occurrence of a recognised
     * {@code .suffix}. Only the literal prefix {@code http:} is recognised.
     *
     * @param src text containing one or more image URLs
     */
    public void getpicsrc(String src)
    {
        if (!src.contains("http:"))
        {
            return;
        }
        for (String segment : src.split("http:"))
        {
            if (isBlank(segment))
            {
                continue;
            }
            for (String suffix : PICTURE_SUFFIXES)
            {
                // Search for ".suffix", not the bare suffix, so that a path such as
                // "/jpgs/a.jpg" is not cut at the directory name.
                int dot = segment.indexOf("." + suffix);
                if (dot >= 0)
                {
                    pList.add("http:" + segment.substring(0, dot + 1 + suffix.length()));
                }
            }
        }
    }

    /**
     * Removes duplicate entries while keeping the first occurrence of each.
     *
     * @param result list that may contain duplicates
     * @return a new list with the distinct elements in their original order
     */
    public static List<String> RemoveRepeated(List<String> result)
    {
        return new ArrayList<String>(new LinkedHashSet<String>(result));
    }

    /**
     * Tells whether a character sequence is {@code null}, empty or whitespace only.
     *
     * @param cs sequence to test, may be {@code null}
     * @return {@code true} if there is no non-whitespace character
     */
    public static boolean isBlank(CharSequence cs)
    {
        if (cs == null)
        {
            return true;
        }
        for (int i = 0; i < cs.length(); i++)
        {
            if (!Character.isWhitespace(cs.charAt(i)))
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Downloads the images one after another on the calling thread.
     *
     * <p>Each file is named after the last path segment of its URL. A failure is
     * reported and the remaining images are still attempted.
     *
     * @param listImgSrc image URLs
     * @param savedir    path prefix (ending with a separator) the file name is appended to
     */
    private void Download(List<String> listImgSrc, String savedir)
    {
        for (String url : listImgSrc)
        {
            try
            {
                saveImage(url, new File(savedir + fileNameOf(url)));
                System.out.println("*^_^*");
            }
            catch (IOException | RuntimeException e)
            {
                System.out.println("-_-!");
                System.err.println(url + " -> " + e);
            }
        }
    }

    /**
     * Downloads the images on worker threads, {@code tnum} images per thread.
     *
     * @param listImgSrc image URLs
     * @param savedir    path prefix (ending with a separator) for the saved files
     * @param tnum       number of images handled by each thread, must be positive
     * @throws IllegalArgumentException if {@code tnum} is not positive
     */
    private void ThreadDownload(List<String> listImgSrc, String savedir, int tnum)
    {
        requirePositive(tnum);
        for (int i = 0; i < listImgSrc.size(); i += tnum)
        {
            new DThread(savedir, tnum, listImgSrc, i).start();
        }
    }

    /**
     * Worker that downloads one batch of images under random UUID file names.
     *
     * @author ITWANG
     */
    class DThread extends Thread
    {

        private final String savedir;
        private final int tnum;
        private final List<String> listImgSrc;
        private final int bunm;

        /**
         * @param savedir    path prefix for the saved files
         * @param tnum       maximum number of images in this batch
         * @param listImgSrc all image URLs
         * @param bnum       index of the first URL of this batch
         */
        public DThread(String savedir, int tnum, List<String> listImgSrc, int bnum)
        {
            this.savedir = savedir;
            this.tnum = tnum;
            this.listImgSrc = listImgSrc;
            this.bunm = bnum;
        }

        @Override
        public void run()
        {
            // The last batch may be shorter than tnum; never index past the list.
            int end = Math.min(bunm + tnum, listImgSrc.size());
            for (int i = bunm; i < end; i++)
            {
                String url = listImgSrc.get(i);
                try
                {
                    String path = savedir + UUID.randomUUID().toString() + extensionOf(url);
                    System.out.println(path);
                    saveImage(url, new File(path));
                    System.out.println("*^_^*");
                }
                catch (IOException | RuntimeException e)
                {
                    System.out.println("-_-!");
                    System.err.println(url + " -> " + e);
                }
            }
        }
    }

    /**
     * Downloads the images on worker threads with a network timeout.
     *
     * @param listImgSrc image URLs
     * @param savedir    directory for the saved files
     * @param tnum       number of images handled by each thread, must be positive
     * @param timeout    timeout in milliseconds passed to {@link #getPic}
     * @throws IllegalArgumentException if {@code tnum} is not positive
     */
    private void TOThreadDownload(List<String> listImgSrc, String savedir, int tnum, int timeout)
    {
        requirePositive(tnum);
        for (int i = 0; i < listImgSrc.size(); i += tnum)
        {
            new TODThread(savedir, tnum, listImgSrc, i, timeout).start();
        }
    }

    /**
     * Worker that downloads one batch of images through {@link #getPic}.
     *
     * @author ITWANG
     */
    class TODThread extends Thread
    {
        private final String savedir;
        private final int tnum;
        private final List<String> listImgSrc;
        private final int bunm;
        private final int timeout;

        /**
         * @param savedir    directory for the saved files
         * @param tnum       maximum number of images in this batch
         * @param listImgSrc all image URLs
         * @param bnum       index of the first URL of this batch
         * @param timeout    timeout in milliseconds
         */
        public TODThread(String savedir, int tnum, List<String> listImgSrc, int bnum, int timeout)
        {
            this.savedir = savedir;
            this.tnum = tnum;
            this.listImgSrc = listImgSrc;
            this.bunm = bnum;
            this.timeout = timeout;
        }

        @Override
        public void run()
        {
            // The last batch may be shorter than tnum; never index past the list.
            int end = Math.min(bunm + tnum, listImgSrc.size());
            for (int i = bunm; i < end; i++)
            {
                String url = listImgSrc.get(i);
                try
                {
                    String imageName = UUID.randomUUID().toString() + extensionOf(url);
                    if (getPic(url, savedir, imageName, timeout))
                    {
                        System.out.println("*^_^*");
                    }
                    else
                    {
                        System.out.println("-_-!");
                    }
                }
                catch (Exception e)
                {
                    System.out.println("下载异常");
                    System.err.println(url + " -> " + e);
                }
            }
        }
    }

    /**
     * Downloads one image with an HTTP GET request.
     *
     * @param purl     image URL (must be http or https)
     * @param folder   directory to save into; created if missing
     * @param filename name of the file inside {@code folder}
     * @param timeout  connect and read timeout in milliseconds
     * @return {@code true} if the server answered 200 and the file was written,
     *         {@code false} for any other status code
     * @throws Exception if the URL is invalid, the directory cannot be created,
     *                   or the transfer fails or times out
     */
    public boolean getPic(String purl, String folder, String filename, int timeout) throws Exception
    {
        HttpURLConnection conn = (HttpURLConnection) new URL(purl).openConnection();
        try
        {
            conn.setConnectTimeout(timeout);
            // Without a read timeout a stalled server would block the worker forever.
            conn.setReadTimeout(timeout);
            // No setDoOutput(true): this request has no body to send.
            conn.setRequestMethod("GET");
            if (conn.getResponseCode() != HttpURLConnection.HTTP_OK)
            {
                return false;
            }
            File dir = new File(folder);
            if (!dir.mkdirs() && !dir.isDirectory())
            {
                throw new IOException("cannot create directory " + dir);
            }
            try (InputStream is = conn.getInputStream();
                 OutputStream os = new FileOutputStream(new File(dir, filename)))
            {
                copy(is, os);
            }
            return true;
        }
        finally
        {
            conn.disconnect();
        }
    }

    /**
     * Builds a sub-directory name from the current time.
     *
     * @return a backslash, the time formatted as {@code yyyy-MM-dd-HH-mm-ss}, and
     *         another backslash
     */
    public static String saveDiff()
    {
        SimpleDateFormat formate = new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss");
        return "\\" + formate.format(System.currentTimeMillis()) + "\\";
    }

    // Fetches url and writes it to target, creating the parent directory first.
    private static void saveImage(String url, File target) throws IOException
    {
        File parent = target.getParentFile();
        // mkdirs() returns false when the directory already exists, hence the second check.
        if (parent != null && !parent.mkdirs() && !parent.isDirectory())
        {
            throw new IOException("cannot create directory " + parent);
        }
        try (InputStream in = new URL(url).openStream();
             OutputStream out = new FileOutputStream(target))
        {
            copy(in, out);
        }
    }

    // Copies everything from in to out; closing the streams is left to the caller.
    private static void copy(InputStream in, OutputStream out) throws IOException
    {
        byte[] buf = new byte[BUFFER_SIZE];
        int length;
        while ((length = in.read(buf)) != -1)
        {
            out.write(buf, 0, length);
        }
    }

    // Returns the last path segment of url, ignoring any query string or fragment.
    private static String fileNameOf(String url)
    {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0)
        {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end)
        {
            end = fragment;
        }
        String path = url.substring(0, end);
        return path.substring(path.lastIndexOf('/') + 1);
    }

    // Returns the extension of url's file name including the dot, or "" if it has none.
    private static String extensionOf(String url)
    {
        String name = fileNameOf(url);
        int dot = name.lastIndexOf('.');
        return dot >= 0 ? name.substring(dot) : "";
    }

    // Rejects batch sizes that would keep the batching loops from advancing.
    private static void requirePositive(int tnum)
    {
        if (tnum <= 0)
        {
            throw new IllegalArgumentException("tnum must be positive: " + tnum);
        }
    }

}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值