web

网页爬虫实战：邮箱抓取

最新推荐文章于 2024-05-21 20:38:51 发布

转载最新推荐文章于 2024-05-21 20:38:51 发布 · 706 阅读

/* 网页爬虫
 *
 * 爬邮箱、爬关键字
 */

/* 1、读取文件（本地文件或网页）
 * 2、对读取的数据进行规则匹配，从中获取符合规则的数据
 * 3、将符合规则的数据储存到集合中 */

[java]view plaincopyprint? 
   
 package regexTest;  
   
 import java.io.BufferedReader;
 import java.io.FileNotFoundException;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
   
 /** 
  * @author Administrator  Alt+Shift+j 
  * 
  */  
   
 /*网页爬虫 
  *  
  * 爬邮箱、爬关键字 
  * */  
 /**
  * Small web-crawler demo that collects e-mail addresses from text.
  *
  * <p>The input is read line by line, each line is scanned with the e-mail
  * pattern, and every match is appended to a list in the order it was found
  * (duplicates are kept).
  */
 public class RegexTest {

     /** Page that {@link #getMail()} downloads and scans. */
     private static final String SOURCE_URL = "http://www.hao123.com/index.html";

     /**
      * E-mail rule: word characters, {@code @}, word characters, then one or
      * more {@code .word} groups. Compiled once instead of on every call.
      */
     private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

     /**
      * Crawls the default page and prints every e-mail address found.
      *
      * @param args ignored
      */
     public static void main(String[] args) {
         try {
             List<String> mails = getMail();
             System.out.println("爬到的资源");
             for (String mail : mails) {
                 System.out.println("====>" + mail);
             }
         } catch (Exception e) {
             // Demo entry point: report the failure (e.g. a network error) and return.
             e.printStackTrace();
         }
     }

     /**
      * Downloads the default page and extracts the e-mail addresses it contains.
      *
      * <p>The stream is decoded as UTF-8 so the result does not depend on the
      * platform default charset, and it is closed by {@link #getMail(Reader)}
      * once scanning finishes or fails.
      *
      * @return matched addresses in order of appearance; empty if none were found
      * @throws Exception if the URL cannot be opened or reading fails
      */
     public static List<String> getMail() throws Exception {
         URL url = new URL(SOURCE_URL);
         return getMail(new InputStreamReader(url.openStream(), StandardCharsets.UTF_8));
     }

     /**
      * Extracts e-mail addresses from any character source, for example a
      * {@code FileReader} over a saved page or a {@code StringReader}.
      *
      * @param source text to scan; it is always closed before this method returns
      * @return matched addresses in order of appearance; empty if none were found
      * @throws IOException if reading from {@code source} fails
      */
     public static List<String> getMail(Reader source) throws IOException {
         List<String> mails = new ArrayList<>();
         // try-with-resources closes the reader (and the stream under it) on every path.
         try (BufferedReader lines = new BufferedReader(source)) {
             String line;
             while ((line = lines.readLine()) != null) {
                 // Match one line at a time; an address is never split across lines.
                 Matcher matcher = MAIL_PATTERN.matcher(line);
                 while (matcher.find()) {
                     mails.add(matcher.group());
                 }
             }
         }
         return mails;
     }

 }