第一种方式是爬取本地的网页:先把网页保存在本地,然后再爬取。
核心代码:
BufferedReader buf=new BufferedReader(new FileReader("本地网页文件路径"));
String st=buf.readLine();
Pattern p=Pattern.compile(Regex);
Matcher ma=p.matcher(st);
while(ma.find())
{print(ma.group());}
package com.wanhao;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demo that extracts e-mail-like tokens from a locally saved HTML page
 * using a regular expression and prints them to standard output.
 */
public class RegexDemo {
    /**
     * Matches one or more word characters, an '@', one or more word
     * characters, then one or more ".word" groups (e.g. {@code a@b.com}).
     * Compiled once because the pattern never changes.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Program entry point: runs the demo and prints the stack trace if
     * reading the file fails.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            regex_test();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the file {@code f:\mail.html} line by line, collects every
     * substring matching {@link #MAIL_PATTERN}, and prints each match on
     * its own line in the order found.
     *
     * @throws IOException if the file cannot be opened or read
     */
    public static void regex_test() throws IOException {
        BufferedReader bufIn = new BufferedReader(new FileReader("f:\\mail.html"));
        List<String> mails;
        try {
            mails = collectMails(bufIn);
        } finally {
            // Always release the file handle, even when reading fails.
            bufIn.close();
        }
        for (String mail : mails) {
            System.out.println(mail);
        }
    }

    /** Returns every MAIL_PATTERN match found in the reader's lines, in encounter order. */
    private static List<String> collectMails(BufferedReader in) throws IOException {
        List<String> found = new ArrayList<String>();
        String line;
        while ((line = in.readLine()) != null) {
            Matcher matcher = MAIL_PATTERN.matcher(line);
            while (matcher.find()) {
                found.add(matcher.group());
            }
        }
        return found;
    }
}
第二种直接爬取网站上的邮箱信息:
基本思路是相同的,不同之处仅在于BufferedReader的获取方式。
URL url=new URL("要爬的网址");
BufferedReader buf=new BufferedReader(new InputStreamReader(url.openStream()));
package com.wanhao;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demo that downloads a web page and extracts e-mail-like tokens from it
 * using a regular expression, printing them to standard output.
 */
public class RegexDemo {
    /**
     * Matches one or more word characters, an '@', one or more word
     * characters, then one or more ".word" groups (e.g. {@code a@b.com}).
     * Compiled once because the pattern never changes.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Program entry point: runs the demo and prints the stack trace if
     * fetching or reading the page fails.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            regex_test();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Opens the hard-coded page URL, reads the response line by line,
     * collects every substring matching {@link #MAIL_PATTERN}, and prints
     * each match on its own line in the order found.
     *
     * @throws IOException if the URL is malformed, or the stream cannot be
     *                     opened or read
     */
    public static void regex_test() throws IOException {
        URL url = new URL("http://tieba.baidu.com/p/4221284005?fr=ala0&pstaala=2&tpl=5");
        BufferedReader bufIn = new BufferedReader(new InputStreamReader(url.openStream()));
        List<String> mails;
        try {
            mails = collectMails(bufIn);
        } finally {
            // Closing the reader also closes the underlying network stream,
            // even when reading fails part-way through.
            bufIn.close();
        }
        for (String mail : mails) {
            System.out.println(mail);
        }
    }

    /** Returns every MAIL_PATTERN match found in the reader's lines, in encounter order. */
    private static List<String> collectMails(BufferedReader in) throws IOException {
        List<String> found = new ArrayList<String>();
        String line;
        while ((line = in.readLine()) != null) {
            Matcher matcher = MAIL_PATTERN.matcher(line);
            while (matcher.find()) {
                found.add(matcher.group());
            }
        }
        return found;
    }
}