java 写的一个蜘蛛程序

最新推荐文章于 2023-05-31 17:33:18 发布

疯狂拇指

最新推荐文章于 2023-05-31 17:33:18 发布

阅读量2k

点赞数 1

CC 4.0 BY-SA版权

分类专栏： JSP JAVA 文章标签： java string exception url null regex

本文链接：https://blog.youkuaiyun.com/weiqingli190949353/article/details/3950301

JSP JAVA 专栏收录该内容

33 篇文章

订阅专栏

package com.qili.spider;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Timer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Spider implements Runnable{
/**
*
* 日期 2009-3-2
* 蜘蛛 Spider
* 作者: 韦庆礼
* QQ 826785300
*
*/
static ArrayList<String> disallowListCache=new ArrayList<String>(); // cache of URL prefixes that robots.txt files disallow (filled by isRobotAllowed)
static ArrayList<String> allUrls=new ArrayList<String>(); // every URL discovered so far; doubles as the crawl queue in robots()
static ArrayList<String> hasSearchUrls=new ArrayList<String>();     // URLs that have already been fetched and analysed
static ArrayList<String> roallurls=new ArrayList<String>();     // scratch: raw links returned by getUrls for the page being processed
static ArrayList<String> roFilter=new ArrayList<String>();      // scratch: the links from roallurls that survived filterURL
static ArrayList<String> Allhost=new ArrayList<String>();        // hosts for which getHtml has been called at least once
static ArrayList<String> AllType=new ArrayList<String>();      // image file extensions added by setTypeUrl; NOTE(review): no method visible in this file reads it
static int MaxdisallowListCache=5000;             // disallowListCache is cleared once it reaches this many entries (isRobotAllowed enforces a minimum of 200)
static String checkChar=null;      // keyword to look for in fetched pages; null disables the keyword check
static String webUrl="http://www.hao123.com/";     // seed URL; expected form is http://.*/
static int maxURLCheng=10;     // maximum URL depth (see getUrlCheng) a link may have to be kept
static int UrlWeight=10;       // minimum weight (see getUrlWeight) a link must have to be kept
static boolean isHost=false;     // when true, filterURL keeps only links on the seed URL's domain; off by default
static boolean robots_txt=false;     // when true, filterURL consults robots.txt through isRobotAllowed; off by default
static int NmaxUrl=50000;         // the crawl stops once allUrls holds this many URLs (50000)
static int NThread=1;        // number of threads main() starts (1)
static int timeout=10000;          // connect and read timeout used by getHtml, in milliseconds

static boolean isWriteLog=true;     // whether log entries are written to the log file; on by default
static String LogoFileName=null;          // path of the log file for this run (set by robots() from createFile)
static String Log=null;              // shared buffer holding the text of the log entry being written
static double AllPagesize=0;              // running total of downloaded page size (the character count getHtml reports)
static double NwriteLogSum=200;     // download volume, in KB (1024), that must accumulate between progress log entries
static double NtempLog=0;            // size accumulated since the last progress log entry
/** *
   * 输入一网址
    * 读取一个网页全部内容
    * 另一种获取的方式
    */
//   public static String getHtml(String webUrl){
//      try {
//      System.out.println();
//    System.out.println("    正在获取 "+webUrl+" HTML内容");
//      URL pageUrl = new URL(webUrl);
//      try{
//         String host=pageUrl.getHost();
//         if(!Allhost.contains(host)&&!host.equals("")){
//         Allhost.add(host);
//         //写系统日志
//         Log="搜索到此主机的相关页面 Host: "+host;
//         WriteLogo(LogoFileName,Log,isWriteLog);
//         }
//      }catch(Exception ex){
//        //写系统日志
//    Log=" 无法获取主机: "+webUrl+" "+ex.toString();
//    WriteLogo(LogoFileName,Log,isWriteLog);
//      }
//             BufferedReader reader =
//               new BufferedReader(new InputStreamReader(pageUrl.openStream()));
//             String line;
//             StringBuffer pageBuffer = new StringBuffer();
//             while ((line = reader.readLine()) != null) {
//               pageBuffer.append(line);
//             }
//             double temPageSize=(double)pageBuffer.length();
//             TotalDownLoadPagesize(temPageSize);       //调用方法保存下载总大小信息
//             return pageBuffer.toString();
//
//          } catch (Exception e) {
//         System.out.println("     Fail");
//      System.out.println( e.toString());
//      //写系统日志
//    Log=" 无法获取 HTML内容: "+webUrl;
//    WriteLogo(LogoFileName,Log,isWriteLog);
//          }
//          return null;
//    }

//*************************************************************
/**
 * Timer task that reports how long the current download has been waiting.
 * Every invocation bumps the tick counter and prints it; getHtml schedules it
 * once per second, so the printed number approximates elapsed seconds.
 */
static class MyTask extends java.util.TimerTask {

    /** Number of times {@link #run()} has been invoked. */
    int num = 0;

    @Override
    public void run() {
        num = num + 1;
        StringBuilder line = new StringBuilder(" 己花费   ");
        line.append(num);
        line.append(" S");
        System.out.println(line.toString());
    }
}
    /**
     * Downloads the document at {@code webUrl} and returns it as one string.
     *
     * <p>The URL's host is added to {@code Allhost} (and logged) the first time it is seen.
     * While the response is being read a {@link MyTask} ticker prints once per second, and
     * the number of characters read is passed to {@link #TotalDownLoadPagesize(double)}.
     * Lines are concatenated without their line terminators, and redirects are not followed
     * for this connection.</p>
     *
     * @param webUrl absolute URL to fetch
     * @return the response body, or {@code null} if anything goes wrong while parsing the
     *         URL, connecting or reading (the failure is printed and logged)
     * @throws IOException kept in the signature for existing callers; failures inside the
     *         method are caught and reported through the {@code null} return value
     */
    public static String getHtml(String webUrl) throws IOException{
        System.out.println();
        Timer timer = new Timer();
        StringBuffer sb = new StringBuffer("");
        HttpURLConnection con = null;
        BufferedReader br = null;
        try{
            URL urlmy = new URL(webUrl);
            try{
                String host=urlmy.getHost();
                if(!Allhost.contains(host)&&!host.equals("")){
                    Allhost.add(host);
                    // log the first visit to this host
                    Log="搜索到此主机的相关页面 Host: "+host;
                    WriteLogo(LogoFileName,Log,isWriteLog);
                }
            }catch(Exception ex){
                Log=" 无法获取主机: "+webUrl+" "+ex.toString();
                WriteLogo(LogoFileName,Log,isWriteLog);
            }
            con = (HttpURLConnection) urlmy.openConnection();
            // Only the per-connection switch is set. The former static
            // HttpURLConnection.setFollowRedirects(true) call changed JVM-wide state and was
            // overridden for this connection by the line below anyway.
            con.setInstanceFollowRedirects(false);
            con.setConnectTimeout(timeout);
            con.setReadTimeout(timeout);
            con.connect();
            System.out.println("");
            System.out.println("正在连接 ...");
            System.out.println("地址是    "+webUrl);
            timer.schedule(new MyTask(), 1, 1000); // first tick after 1 ms, then every second
            String contentType=con.getContentType();
            System.out.println("方式   "+contentType);
            System.out.println("状态 "+con.getResponseMessage());
            System.out.println();
            System.out.println("正在读取HTML流 ...");
            // NOTE(review): the platform default charset is used; pages in another encoding
            // will be decoded incorrectly.
            br=new BufferedReader(new InputStreamReader(con.getInputStream()));
            String s;
            while ((s = br.readLine()) != null) {
                sb.append(s);
            }
            double temPageSize=(double)sb.length();
            TotalDownLoadPagesize(temPageSize);       // record the download volume
            return sb.toString();
        }catch(Exception ex){
            System.out.println("     Fail");
            System.out.println( ex.toString());
            Log=" 无法获取 HTML内容: "+webUrl;
            WriteLogo(LogoFileName,Log,isWriteLog);
        }
        finally{
            timer.cancel();   // stop the ticker on every exit path
            if(br!=null){
                try{
                    br.close();
                }catch(IOException closeEx){
                    System.out.println( closeEx.toString());
                }
            }
            if(con!=null){
                con.disconnect();
            }
        }
        return null;
    }

   /**
    * Adds one page's size to the download totals and, each time another
    * {@code NwriteLogSum} KB has accumulated, writes a progress entry to the log.
    *
    * <p>The progress entry reports the overall total in KB, MB or GB together with the
    * sizes of {@code hasSearchUrls} and {@code allUrls}.</p>
    *
    * @param temPageSize size of the page that was just downloaded (getHtml passes the
    *        character count of the page text)
    */
    public static void TotalDownLoadPagesize(double temPageSize){
        NtempLog+=temPageSize;
        AllPagesize+=temPageSize;
        System.out.println(" OK "+(temPageSize/1024)+" KB");
        if((NtempLog/1024)<NwriteLogSum){
            return;                 // not enough new data for a progress entry yet
        }
        NtempLog=0;                 // restart the interval counter

        // Pick a unit for the total. The GB case previously had no branch, which left a
        // stale message in Log to be written again.
        String total;
        if(AllPagesize<1048576){
            total=AllPagesize/1024+" KB";
        }else if(AllPagesize<1073741824){
            total=AllPagesize/1048576+" MB";
        }else{
            total=AllPagesize/1073741824+" GB";
        }
        String temLog="have download PageSize is: "+total+"\r\n";
        temLog+=" Has Search URL is:"+hasSearchUrls.size();
        temLog+=" All URLs is:"+allUrls.size()+"\r\n";
        Log=temLog;
        WriteLogo(LogoFileName,Log,isWriteLog);
    }

/**
* String HtmlCount
*
*
* @param 输入一个网址
* @param
* @return 输出url 的集合(没有被帅选)
* @throws IOException
*/
public static ArrayList<String> getUrls(String webUrl){
   ArrayList<String> arrlist=new ArrayList<String>();   //存放输出的URL
   try
   {
    String HtmlCount=getHtml(webUrl);          //掉用getHtml方法获到HTML文本内容******************************************
    System.out.println();
    System.out.println("    正在分析 URL: "+webUrl);
    if(HtmlCount!=null){
     try{
      String regex="<a href.*</a>";
      String output=null;                                 //第一次提取的<a href * </a>
      String outputUrl=null;                            //对第一次提取的Url再次过滤
        Pattern pa=Pattern.compile(regex, Pattern.DOTALL);
        Matcher ma=pa.matcher(HtmlCount);                             //初次过滤
        while(ma.find()){
         output=ma.group().trim();
         regex="<a//s+href//s*=//s*/"?(.*?)[/"|>]";             //再次过滤
         // regex="http//s*/"?(.*?)[/"|>]";
         Pattern paa=Pattern.compile(regex,Pattern.DOTALL);
         Matcher maa=paa.matcher(output);
         while(maa.find()){
          outputUrl=maa.group().trim();
         // System.out.println();
         // System.out.println(outputUrl);
               if (outputUrl.length() < 1) {
          continue;
         }
         // 跳过链到本页面内链接。
         if (outputUrl.charAt(0) == '#') {
          continue;
         }
         if (outputUrl.indexOf("mailto:") != -1) {
          continue;
         }
         if (outputUrl.toLowerCase().indexOf("javascript") != -1) {
          continue;
         }
         if(outputUrl.toLowerCase().indexOf("lotteryData")!=-1){
          continue;
         }
         //********************************************
         //根据实际情况过渡网页的某些垃圾信息
         outputUrl=outputUrl.replace("<a href=", "");
         outputUrl=outputUrl.replace("/"", "");
         outputUrl=outputUrl.replace(">", "");
         outputUrl=outputUrl.replace("/>", "");
         outputUrl=outputUrl.replace("class=","");
         outputUrl=outputUrl.replace("target=_blank", "");
         outputUrl=outputUrl.replace("'target=_blank'", "");
         outputUrl=outputUrl.replace("'_blank'", "");
         outputUrl=outputUrl.replace("target=_self", "");
         outputUrl=outputUrl.replace("target=", "");
         outputUrl=outputUrl.replace("style=", "");
         outputUrl=outputUrl.replace("../", "");
         outputUrl=outputUrl.replace("#", "");
         outputUrl=outputUrl.replace("title=", "");
         outputUrl=outputUrl.replace("<a href=", "");
         outputUrl=outputUrl.replace("'", "");
         //*********************
         int endUrl=outputUrl.indexOf(" ");            //处理url有空格问题
         if(endUrl>0){                              //要是有空格才截断
          outputUrl=outputUrl.substring(0,endUrl); //以空格为结束
         }
         //*************************
         outputUrl=outputUrl.trim();    //过滤空间
             if(outputUrl.length()>0){
            if(outputUrl.indexOf("://")==-1){    //处理相对地址
             int length=webUrl.length();
             int find=webUrl.lastIndexOf("/")+1;
             if(length==find){
            outputUrl=webUrl+outputUrl;           // 如果以/结尾
             }else{
            outputUrl=webUrl+"/"+outputUrl;     // 如果不以 /结尾
             }
            }
            int begin2=outputUrl.lastIndexOf("//")+2; //检查是否是以 //结尾
            if(begin2==outputUrl.length()){
           outputUrl=outputUrl.substring(0, outputUrl.length()-1); // 把   //变成/
            }
            if(!arrlist.contains(outputUrl)){               //去掉重复的url
           arrlist.add(outputUrl);
            }

         }
        }
     }
     }catch(Exception ex){
      System.out.println("    Fail");
      System.out.println(ex.toString());
    //写系统日志
      Log="分析 "+webUrl+" 的连接时出错 "+ex.toString();
      WriteLogo(LogoFileName,Log,isWriteLog);
     }
    }
   }catch(Exception ex){
    System.out.println("    Fail");
    System.out.println(ex.toString());
  //写系统日志
    Log="分析 "+webUrl+" 的连接时出错 "+ex.toString();
   WriteLogo(LogoFileName,Log,isWriteLog);
   }
System.out.println(" OK");
return arrlist;

}

/**
 * Keeps the links that satisfy the crawl limits.
 *
 * <p>A link is kept when all of the following hold: if {@code isHost} is set, its host is
 * the seed domain or a sub-domain of it (the seed domain is the host of the static
 * {@code webUrl} with a leading {@code www.} removed); its depth from
 * {@link #getUrlCheng(String)} is at most {@code maxURLCheng}; its weight from
 * {@link #getUrlWeight(String)} is at least {@code UrlWeight}; and, if {@code robots_txt}
 * is set, {@link #isRobotAllowed(String, int)} permits it. Each rejection is printed with
 * its reason.</p>
 *
 * @param allurls links to filter, normally the result of {@code getUrls}
 * @param roUrl page the links came from; currently unused, because host filtering is done
 *        against the seed URL
 * @param maxURLCheng maximum allowed URL depth
 * @param UrlWeight minimum required URL weight
 * @param isHost whether to restrict links to the seed domain
 * @param robots_txt whether to apply robots.txt rules
 * @param MaxdisallowListCache cache size limit passed on to {@code isRobotAllowed}
 * @return the links that passed every check, in input order; empty on error (never
 *         {@code null})
 */
public static ArrayList<String> filterURL(ArrayList<String> allurls,String roUrl,int maxURLCheng,
   int UrlWeight,boolean isHost,boolean robots_txt,int MaxdisallowListCache){
    ArrayList<String> endFilter=new ArrayList<String>();
    try{
        System.out.println();
        System.out.println("      正在过滤符合条件的URL");
        // The seed domain is the same for every link, so compute it once.
        String seedDomain=isHost?seedDomainOf(webUrl):null;
        for(int i=0;i<allurls.size();i++){
            String tmepUrl=allurls.get(i);
            if(isHost&&!isOnSeedDomain(tmepUrl,seedDomain)){
                continue;
            }
            int urlCheng=getUrlCheng(tmepUrl);
            int urlWeight=getUrlWeight(tmepUrl);
            if(urlCheng>maxURLCheng||urlWeight<UrlWeight){
                System.out.println(" 受层数限制或是权重太小"+" 层数 "+urlCheng+" 权重 "+urlWeight+" URL: "+tmepUrl);
                continue;
            }
            if(robots_txt&&!isRobotAllowed(tmepUrl,MaxdisallowListCache)){
                System.out.println("      受robots.txt协议限制: "+tmepUrl);
                continue;
            }
            endFilter.add(tmepUrl);
        }
    }catch(Exception ex){
        System.out.println("    Fail");
        System.out.println("    "+ex.toString());
        Log="过滤符合条件的url 时异常 "+ex.toString();
        WriteLogo(LogoFileName,Log,isWriteLog);
    }
    System.out.println("    OK");
    return endFilter;
}

/** Returns the lower-cased host of {@code seedUrl} without a leading "www.", or null if it cannot be determined. */
private static String seedDomainOf(String seedUrl){
    try{
        // Parsing the URL keeps a path or port in the seed from leaking into the domain,
        // which the former substring/replace approach did not.
        String seedHost=new URL(seedUrl).getHost().toLowerCase();
        if(seedHost.startsWith("www.")){
            seedHost=seedHost.substring("www.".length());
        }
        return seedHost.length()>0?seedHost:null;
    }catch(Exception ex){
        Log="无法获取 "+seedUrl+" 主机 "+ex.toString();
        WriteLogo(LogoFileName,Log,isWriteLog);
        return null;
    }
}

/** Tells whether the host of {@code candidateUrl} is {@code seedDomain} itself or one of its sub-domains. */
private static boolean isOnSeedDomain(String candidateUrl,String seedDomain){
    String host;
    try{
        host=new URL(candidateUrl).getHost().toLowerCase();
    }catch(Exception em){
        // A link whose host cannot be read is dropped, as before.
        Log="无法获取 "+candidateUrl+" 主机 "+em.toString();
        WriteLogo(LogoFileName,Log,isWriteLog);
        return false;
    }
    // Exact or dot-separated suffix match; a plain substring test would also accept hosts
    // such as "hao123.com.example.net".
    boolean onDomain=seedDomain!=null
        &&(host.equals(seedDomain)||host.endsWith("."+seedDomain));
    if(!onDomain){
        System.out.println("     不在主机内 ULR: "+candidateUrl);
    }
    return onDomain;
}

/**
 * Scores a URL; higher means "more worth crawling".
 *
 * <p>The score starts at 20 and loses one point per path level (slashes, not counting the
 * two after the scheme, a trailing slash, or one extra when a second {@code //} occurs),
 * one per {@code ?}, one per {@code &}, and two for each of the words {@code search},
 * {@code proxy} and {@code gate} that occurs in the URL. An empty URL keeps the starting
 * score. The result is never negative.</p>
 *
 * @param url the URL to score
 * @return the weight; 0 when the computation fails (the failure is printed and logged)
 */
public static int getUrlWeight(String url)
{
    final int startWeight=20;
    int weight=startWeight;
    try{
        System.out.println();
        System.out.println("   正在计算权重 URL: "+url);
        if(url.length()!=0){
            int penalty=0;
            // one pass counts path separators and query punctuation alike
            for(char c:url.toCharArray()){
                switch(c){
                    case '/':
                    case '?':
                    case '&':
                        penalty++;
                        break;
                    default:
                        break;
                }
            }
            if(url.contains("://")){
                penalty-=2;              // the scheme's two slashes are not levels
            }
            if(url.lastIndexOf('/')==url.length()-1){
                penalty-=1;              // neither is a trailing slash
            }
            if(url.indexOf("//")!=url.lastIndexOf("//")){
                penalty-=1;              // a second "//" counts as one separator
            }
            String[] suspectWords={"search","proxy","gate"};
            for(String word:suspectWords){
                if(url.contains(word)){
                    penalty+=2;
                }
            }
            weight=startWeight-penalty;
        }
    }catch(Exception ex){
        System.out.println("    Fail");
        System.out.println("   "+ex.toString());
        weight=0;
        Log="计算权重 "+url+" 权重异常 "+ex.toString();
        WriteLogo(LogoFileName,Log,isWriteLog);
    }
    System.out.println("    OK");
    return Math.max(weight,0);
}
/**
 * Computes the depth (number of path levels) of a URL, used as a crawl limit.
 *
 * <p>The depth is the number of {@code /} characters, minus two when the URL contains
 * {@code ://}, minus one when the last character is a slash (or the URL is empty), and
 * minus one when {@code //} occurs at more than one position.</p>
 *
 * @param url the URL to measure
 * @return the depth; 0 when the computation fails (the failure is printed and logged)
 */
public static int getUrlCheng(String url){
    int depth=0;
    try{
        System.out.println();
        System.out.println("        正在计算层数 URL: "+url);
        int slashes=0;
        for(char c:url.toCharArray()){
            if(c=='/'){
                slashes++;
            }
        }
        if(url.contains("://")){
            slashes-=2;
        }
        if(url.lastIndexOf('/')==url.length()-1){
            slashes-=1;
        }
        if(url.indexOf("//")!=url.lastIndexOf("//")){
            slashes-=1;
        }
        depth=slashes;
    }catch(Exception ex){
        System.out.println("    Fail");
        System.out.println(ex.toString());
        Log="计算层数 "+url+" 异常 "+ex.toString();
        WriteLogo(LogoFileName,Log,isWriteLog);
    }
    System.out.println("    OK");
    return depth;
}

/**
 * Tells whether robots.txt rules addressed to all robots ({@code User-agent: *}) permit
 * fetching {@code webUrl}.
 *
 * <p>Disallowed prefixes are cached in {@code disallowListCache} as
 * {@code "http://" + host + path}. The cache is consulted first; on a miss the host's
 * {@code /robots.txt} is downloaded, its rules are added to the cache, and the cache is
 * consulted again. The cache is emptied when it reaches {@code MaxdisallowListCache}
 * entries. When the URL cannot be parsed or robots.txt cannot be read, the URL is treated
 * as allowed.</p>
 *
 * @param webUrl the URL to check
 * @param MaxdisallowListCache cache size limit; values of 200 or less are raised to 200
 * @return {@code false} if a cached disallow prefix matches the URL, otherwise {@code true}
 */
static boolean isRobotAllowed(String webUrl,int MaxdisallowListCache) {
    if(MaxdisallowListCache<=200){
        MaxdisallowListCache=200;
    }
    boolean allowed=true;
    try{
        System.out.println();
        System.out.println("    正在检查是否是受到robots协议的限制 URL: "+webUrl);
        if(disallowListCache.size()>=MaxdisallowListCache){
            disallowListCache.clear();       // keep the cache bounded
            System.out.println("   缓存己满清空");
            Log="缓存己满清空 ";
            WriteLogo(LogoFileName,Log,isWriteLog);
        }
        if(isDisallowedByCache(webUrl)){
            allowed=false;                   // cache hit: no download needed
        }else{
            URL urlToCheck=null;
            try{
                urlToCheck=new URL(webUrl);
            }catch(Exception ex){
                Log="无法获取 "+webUrl+" 的主机";
                WriteLogo(LogoFileName,Log,isWriteLog);
            }
            // An unparsable URL or an unreadable robots.txt leaves the URL allowed. This
            // used to happen implicitly through a NullPointerException.
            if(urlToCheck!=null&&cacheDisallowRules(urlToCheck.getHost().toLowerCase())){
                allowed=!isDisallowedByCache(webUrl);
            }
        }
    }catch(Exception ex){
        System.out.println("    Fail");
        allowed=true;
        Log="检查URL "+webUrl+" 是否受robotx协议时异常 "+ex.toString();
        WriteLogo(LogoFileName,Log,isWriteLog);
    }
    System.out.println("    OK");
    return allowed;
}

/** Returns true when some cached disallow prefix is a prefix of {@code webUrl}. */
private static boolean isDisallowedByCache(String webUrl){
    for(int i=0;i<disallowListCache.size();i++){
        if(webUrl.startsWith(disallowListCache.get(i))){
            return true;
        }
    }
    return false;
}

/** Downloads {@code host}'s robots.txt and caches its "User-agent: *" disallow prefixes; returns false if it could not be read. */
private static boolean cacheDisallowRules(String host){
    URL robotsFileUrl=null;
    BufferedReader reader=null;
    try{
        try{
            robotsFileUrl=new URL("http://"+host+"/robots.txt");
            reader=new BufferedReader(new InputStreamReader(robotsFileUrl.openStream()));
        }catch(Exception ex){
            Log="无法获取 "+robotsFileUrl+" 的内容";
            WriteLogo(LogoFileName,Log,isWriteLog);
            return false;
        }
        boolean robotsUser=false;            // true while inside a "User-agent: *" group
        String line;
        while((line=reader.readLine())!=null){
            line=line.trim();
            // Directive names are matched without regard to case.
            String lower=line.toLowerCase(java.util.Locale.ENGLISH);
            if(lower.startsWith("user-agent:")){
                robotsUser=line.indexOf("*")!=-1;
                continue;
            }
            if(!robotsUser||!lower.startsWith("disallow:")){
                continue;
            }
            String disallowPath=line.substring("disallow:".length());
            int commentIndex=disallowPath.indexOf("#");
            if(commentIndex!=-1){
                disallowPath=disallowPath.substring(0,commentIndex);   // drop trailing comment
            }
            disallowPath=disallowPath.trim();
            if(disallowPath.length()==0){
                // An empty Disallow means "nothing is disallowed". Caching "http://host"
                // for it would block every URL on the host.
                continue;
            }
            String webDisallowUrl="http://"+host+disallowPath;
            if(!disallowListCache.contains(webDisallowUrl)){
                System.out.println();
                System.out.println("     正在将路径加入到缓存中 URL: "+webDisallowUrl);
                disallowListCache.add(webDisallowUrl);
                System.out.println("     OK");
            }
        }
        return true;
    }catch(IOException ex){
        // A robots.txt that fails part-way is treated like a missing one.
        Log="读取 "+robotsFileUrl+" 时出错 "+ex.toString();
        WriteLogo(LogoFileName,Log,isWriteLog);
        return false;
    }finally{
        if(reader!=null){
            try{
                reader.close();
            }catch(IOException ignored){
                // nothing useful can be done if closing fails
            }
        }
    }
}

    /**
     * Runs one complete crawl starting at the static {@code webUrl}.
     *
     * <p>Steps: create the log file (the crawl is abandoned if logging is enabled but the
     * file cannot be created), print and log the configuration, then treat {@code allUrls}
     * as a growing queue. Each URL not yet in {@code hasSearchUrls} is fetched with
     * {@code getUrls}, its links are filtered with {@code filterURL}, and new links are
     * appended to {@code allUrls}. When {@code checkChar} is set, each newly added link is
     * also downloaded and searched for that keyword. The crawl ends when {@code allUrls}
     * reaches {@code NmaxUrl} entries or the queue is exhausted; a summary is printed and
     * logged in both cases.</p>
     */
    public static void robots(){
        long startTime=getNowTime();
        setTypeUrl();
        LogoFileName=createFile(isWriteLog);
        if(isWriteLog&&LogoFileName==null){
            return;      // logging was requested but no log file could be created
        }
        System.out.println(getExecInfo());
        allUrls.clear();
        hasSearchUrls.clear();
        try{
            // The seed page is processed up front, without the keyword check.
            allUrls.add(webUrl);
            System.out.println();
            System.out.println("    己添加到   AllURLS     "+webUrl );
            hasSearchUrls.add(webUrl);
            System.out.println("       己添加到   hasSearchUrls     "+webUrl );
            roallurls= getUrls(webUrl);
            System.out.println("    此 url "+webUrl+" 的连接数是 "+roallurls.size());
            roFilter=filterURL(roallurls,webUrl,maxURLCheng,
                    UrlWeight,isHost,robots_txt,MaxdisallowListCache);
            for(int k=0;k<roFilter.size();k++){
                String seedLink=roFilter.get(k);
                if(!allUrls.contains(seedLink)&&!seedLink.equals("")){
                    allUrls.add(seedLink);
                    System.out.println("己添加到     AllURLS   "+seedLink );
                }
            }

            // allUrls grows while it is walked, so the size is re-read on every pass.
            for(int i=0;i<allUrls.size();i++){
                String current=allUrls.get(i);
                if(!hasSearchUrls.contains(current)&&!current.equals("")){
                    System.out.println();
                    roallurls=getUrls(current);
                    hasSearchUrls.add(current);
                    System.out.println();
                    System.out.println("     己添加到   hasSearchUrls   "+current );
                    if(roallurls.size()>0){
                        System.out.println("   此 url "+current+" 的连接数是 "+roallurls.size());
                        roFilter=filterURL(roallurls,current,maxURLCheng,
                                UrlWeight,isHost,robots_txt,MaxdisallowListCache);
                        // This loop needs its own index: reusing the queue index here made
                        // the outer loop jump to an arbitrary position after each page.
                        for(int k=0;k<roFilter.size();k++){
                            String found=roFilter.get(k);
                            if(!allUrls.contains(found)&&!found.equals("")){
                                allUrls.add(found);
                                if(checkChar!=null){
                                    checkKeyword(found);
                                }
                                System.out.println("     己添加到    AllURLS   "+found );
                            }
                        }
                    }
                    System.out.println();
                    System.out.println(diffTime(startTime));
                    System.out.println("              Has Search "+hasSearchUrls.size());
                    System.out.println("              All url is "+allUrls.size());
                    System.out.println();
                }
                if(allUrls.size()>=NmaxUrl){
                    break;
                }
            }

            boolean reachedLimit=allUrls.size()>=NmaxUrl;
            if(reachedLimit){
                System.out.println("    己达到设定搜索总数"+NmaxUrl+" 蜘蛛正常完成停止工作");
            }else{
                System.out.println("    己搜索总数 "+allUrls.size()+" 蜘蛛工作异常");
            }
            SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            String ly_time = formatter.format(new java.util.Date());
            System.out.println("    DateTiem is :"+ly_time);
            String headline=reachedLimit
                    ?" 己达到设定值"+NmaxUrl+" 蜘蛛正常停止工作"
                    :" 蜘蛛工作异常";
            logCrawlSummary(headline,ly_time,startTime);
        }catch(Exception ex){
            System.out.println("        蜘蛛运行异常");
            System.out.println(ex.toString());
            Log=" 蜘蛛运行异常 "+ex.toString();
            WriteLogo(LogoFileName,Log,isWriteLog);
        }
    }

    /** Downloads {@code url} and reports whether its text contains the configured keyword {@code checkChar}. */
    private static void checkKeyword(String url){
        try{
            String htmlCount=getHtml(url);
            System.out.println("         正在检查HTML中是否包含有关键字: "+checkChar);
            if(htmlCount!=null&&htmlCount.indexOf(checkChar)!=-1){
                System.out.println("    OK");
                System.out.println("The url"+url+"      key is "+checkChar);
                // a matching URL could be stored here
            }
        }catch(Exception ex){
            System.out.println(" 检查关键字失败 "+ex.toString());
            Log="检查URL "+url+" 是否有关键字:"+checkChar+" 异常 "+ex.toString();
            WriteLogo(LogoFileName,Log,isWriteLog);
        }
    }

    /** Writes the end-of-run statistics to the log under the given headline. */
    private static void logCrawlSummary(String headline,String ly_time,long startTime){
        Log=headline+"\r\n";
        Log+=" 结束时间是 "+ly_time+"\r\n";
        Log+=diffTime(startTime)+"\r\n";
        Log+=" 总下载页面大小是 "+(AllPagesize/1024/1024)+" MB \r\n";
        Log+=" 到过的相关页面的主机数是: "+Allhost.size()+" \r\n";
        Log+=" 己搜索的URL数是 "+hasSearchUrls.size()+"\r\n";
        Log+=" 总共URL总数是 "+allUrls.size()+"\r\n";
        WriteLogo(LogoFileName,Log,isWriteLog);
    }

/**
 * Returns the current time in milliseconds since the epoch, truncated to a whole second.
 *
 * <p>The value is taken straight from the system clock. Formatting the date as local time
 * and parsing it back, as was done before, is ambiguous during a daylight-saving overlap
 * and had a failure path that returned 0.</p>
 *
 * @return the current epoch time in milliseconds, always a multiple of 1000
 */
public static long getNowTime()
{
    long nowMillis=System.currentTimeMillis();
    return nowMillis-(nowMillis%1000);
}

/**
 * Registers the image file extensions (lower- and upper-case) in {@code AllType}.
 *
 * <p>Extensions already present are not added again, so calling the method more than once
 * leaves the list unchanged.</p>
 */
static void setTypeUrl(){
    // "JPG" was previously misspelled "JGP".
    String[] imageTypes={"jpg","JPG","gif","GIF","jpeg","JPEG","bmp","BMP","png","PNG"};
    for(int i=0;i<imageTypes.length;i++){
        if(!AllType.contains(imageTypes[i])){
            AllType.add(imageTypes[i]);
        }
    }
}

/**
 * Formats the time elapsed since {@code StartTime} as minutes and seconds.
 *
 * <p>The current time is truncated to a whole second before subtracting, matching the
 * whole-second start times produced by {@code getNowTime}. The clock is read directly, so
 * the method always returns a string.</p>
 *
 * @param StartTime start time in milliseconds since the epoch
 * @return text of the form {@code "              use <min>m <sec>s "}
 */
public static String diffTime(long StartTime){
    long nowMillis=System.currentTimeMillis();
    long nowWholeSecond=nowMillis-(nowMillis%1000);
    long elapsedSeconds=(nowWholeSecond-StartTime)/1000;
    int min=(int)(elapsedSeconds/60);
    int sec=(int)(elapsedSeconds%60);
    return "              use "+min+"m "+sec+"s ";
}

/**
 * Creates the log file for this run and returns its path.
 *
 * <p>The file lives in {@code C://SpiderLog} (created if it is not already a directory)
 * and is named after the current time, {@code yyyy-MM-dd HH-mm-ss.txt}.</p>
 *
 * @param isWriteLog whether logging is enabled; when {@code false} nothing is created
 * @return the path of the log file, or {@code null} when logging is disabled, the file
 *         could not be created, or an error occurred (the error is printed)
 */
public static String createFile(boolean isWriteLog) {
    if(!isWriteLog){
        return null;
    }
    try{
        final String logDir="C://SpiderLog";
        File dir=new File(logDir);
        if(!dir.isDirectory()){
            dir.mkdir();
        }
        String stamp=new SimpleDateFormat("yyyy-MM-dd HH-mm-ss").format(new java.util.Date());
        String logPath=logDir+"//"+stamp+".txt";
        File logFile=new File(logPath);
        // an existing file is reused; otherwise it must be created successfully
        if(logFile.exists()||logFile.createNewFile()){
            return logPath;
        }
    }
    catch(Exception ex){
        System.out.println("创建系统日志文件失败 createFile");
        Log=" 创建系统日志文件失败"+ex.toString();
        WriteLogo(LogoFileName,Log,isWriteLog);
    }
    return null;
}

/**
 * Appends one timestamped entry to the log file.
 *
 * <p>The entry is written as {@code "<yyyy-MM-dd HH-mm-ss> : <Log>"} followed by a line
 * break and an empty line. Failures are printed to standard output and never propagate to
 * the caller.</p>
 *
 * @param LogoFileName path of the log file; when {@code null} the entry is dropped and a
 *        notice is printed
 * @param Log text of the entry
 * @param isWriteLog whether logging is enabled; when {@code false} the call does nothing
 */
public static void WriteLogo(String LogoFileName,String Log,boolean isWriteLog){
    if(!isWriteLog){
        return;
    }
    if(LogoFileName==null){
        // e.g. a failure reported before the log file exists
        System.out.println(" 写日志异常: log file name is null");
        return;
    }
    BufferedWriter mBufWriter=null;
    try{
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss");
        String ly_time = formatter.format(new java.util.Date());
        mBufWriter=new BufferedWriter(new FileWriter(LogoFileName, true));   // append mode
        // "\r\n" replaces the literal text "/r/n" that used to end up in the file
        mBufWriter.write(ly_time+" : "+Log+"\r\n");
        mBufWriter.newLine();
        mBufWriter.flush();
    }catch(Exception ex){
        System.out.println(" 写日志异常"+ex.toString());
    }finally{
        // close even when the write failed, so the file handle is not leaked
        if(mBufWriter!=null){
            try{
                mBufWriter.close();
            }catch(IOException closeEx){
                System.out.println(" 写日志异常"+closeEx.toString());
            }
        }
    }
}
/**
 * Builds a description of the crawler's configuration, writes it to the log, and returns
 * it.
 *
 * <p>The text lists the start time, seed URL, robots cache size, keyword search, timeout,
 * depth and weight limits, host restriction, robots.txt checking, URL and thread limits,
 * and the logging settings, one item per line.</p>
 *
 * @return the configuration text
 */
public static String getExecInfo()
{
    final String nl="\r\n";      // replaces the literal "/r/n" that used to be emitted
    SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    String ly_time = formatter.format(new java.util.Date());

    String info=" 开始日期 :"+ly_time+nl;
    info+=" 初始主机是 "+webUrl+nl;
    info+=" 系统缓存robots协议路径的数量是   "+MaxdisallowListCache+nl;
    if(checkChar==null){
        info+=" 不进行关键字搜索 "+nl;
    }else{
        info+=" 启用关键字搜索关键是: "+checkChar+nl;
    }
    info+=" 超时时间是 "+timeout+" 毫秒 "+nl;
    info+=" 搜索的ULR最大层数是 :"+maxURLCheng+nl;
    info+=" 只搜索URL权重是 :"+UrlWeight+nl;
    if(isHost==false){
        // this branch used to print the same "restricted to host" text as the other one
        info+=" 不限定主机搜索"+nl;
    }else{
        info+=" 限定主机搜索 "+nl;
    }
    if(robots_txt==false){
        info+=" 不进行URL的robots协议的检查 "+nl;
    }else{
        info+=" 进行URL的robots协议的检查 "+nl;
    }
    info+=" 搜索的最大URL数是 :"+NmaxUrl+nl;
    info+=" 线程数是 :"+NThread+nl;
    if(isWriteLog){
        info+=" 存储运行日志 "+nl;
        info+=" 系统日志路径是: "+LogoFileName+nl;
        info+=" 写入日志页面下载量间隔是: "+NwriteLogSum+" KB"+nl;
    }else{
        info+=" 不存储蜘蛛运行日志"+nl;
    }
    info+=" Now Begin Search...";
    WriteLogo(LogoFileName,info,isWriteLog);
    return info;
}
/**
 * Thread entry point: performs one crawl by delegating to {@code robots()}.
 */
public void run()
{
    robots();
}
/**
 * Program entry point: starts {@code NThread} threads that all run the same
 * {@code Spider} instance.
 *
 * @param args command-line arguments (unused)
 * @throws IOException declared by the signature; nothing in the body throws it
 */
public static void main(String[] args) throws IOException {
  Spider spider=new Spider();
  int started=0;
  while(started<NThread){
   Thread worker=new Thread(spider);
   worker.start();
   started++;
  }

}

}