This program is a web crawler that uses multiple threads to fetch pages. It is written in a fairly professional way: the code is tidy and the flow is clear at a glance: take a starting URL —> fetch the content of that page —> extract the new URLs and add them to the URL collection —> start new threads to fetch the pages behind those new URLs, until the preset number of pages has been crawled. The drawback is that every batch of newly discovered pages spawns its own thread, which arguably wastes system resources. To be improved…
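As one possible direction for that improvement, below is a minimal sketch (not part of the original program) of how the same fetch/parse/enqueue flow could run on a fixed-size thread pool instead of one new thread per URL. The class name PooledCrawler, the pool size of 8, and the submit()/shutdown() methods are all illustrative assumptions; the download-and-parse step is left as a placeholder where the existing HTMLPage logic would go.

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;

// Hypothetical sketch: a fixed pool of worker threads is reused for every URL
// instead of starting a brand-new thread for each discovered page.
public class PooledCrawler
{
    private static final int MAX_PAGENUM = 1000;                           // same limit as HTMLPage
    private final ExecutorService pool = Executors.newFixedThreadPool(8);  // assumed pool size
    private final Set<String> seen =
            Collections.synchronizedSet(new HashSet<String>());            // thread-safe de-duplication
    private final AtomicInteger pageCount = new AtomicInteger();

    public void submit(final String url)
    {
        // add() on a synchronized set is atomic, so each URL is accepted only once;
        // stop submitting new work once the page limit has been reached.
        if (!seen.add(url) || pageCount.incrementAndGet() > MAX_PAGENUM)
        {
            return;
        }
        pool.execute(new Runnable()
        {
            public void run()
            {
                // Download the page and extract its links here (the same
                // fetch/parse steps that HTMLPage performs), then call
                // submit() for every newly found link so the pool crawls it.
                System.out.println("fetching " + url);
            }
        });
    }

    public void shutdown()
    {
        pool.shutdown();
    }
}

A caller would create one PooledCrawler, call submit() with the seed URL, and eventually call shutdown(); all crawling then shares the pooled threads rather than creating one thread per page.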
Interface: WebPage.java
import java.io.File;
import java.net.MalformedURLException;

/*
 * Defines the basic operations of a WebPage object.
 */
public interface WebPage
{
    /** Converts the page at the given web address into a local file. */
    public File getPageFile();

    /** Parses the contents of the page.
     * @throws MalformedURLException */
    public void parse() throws MalformedURLException;
}
Implementation and test class: HTMLPage.java
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A simple spider; it throws a lot of exceptions when run, which we ignore for now.
 * @author gbk
 *
 */
public class HTMLPage extends Thread implements WebPage
{
    private static int pageId = 0;
    private static int MAX_PAGENUM = 1000;
    // Stores URLs that have already been processed, to guarantee no duplicates.
    private static Set<String> urls = new HashSet<String>();
    private File localFile;
    private StringBuffer contents;
    private URL url;

    public HTMLPage(URL url)
    {
        this.url = url;
    }

    /**
     * Downloads the page to a local file for later analysis.
     */
    public File getPageFile()
    {
        int ch = 0;
        contents = new StringBuffer();
        pageId++;
        localFile = new File("d:/html/" + pageId + ".txt");
        try
        {
            InputStream inputStream = url.openStream();
            InputStreamReader inputStreamReader = new InputStreamReader(inputStream);
            FileOutputStream fileOutputStream = new FileOutputStream(localFile);
            OutputStreamWriter outputStreamWriter = new OutputStreamWriter(fileOutputStream);
            while ((ch = inputStreamReader.read()) != -1)
            {
                contents.append((char) ch);
                outputStreamWriter.write(ch);
            }
            outputStreamWriter.close();
            fileOutputStream.close();
            inputStreamReader.close();
            inputStream.close();
        } catch (FileNotFoundException e)
        {
            e.printStackTrace();
        } catch (IOException e)
        {
            e.printStackTrace();
        }
        return localFile;
    }

    /**
     * Parses the page and adds URLs that have not been seen before to the candidate set.
     */
    public void parse() throws MalformedURLException
    {
        // Cannot handle internal (relative) links, i.e. ones that do not start with http.
        String regex = "<a.*?href=http://.*?>.*?</a>";
        Pattern pt = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        Matcher mt = pt.matcher(contents);
        while (mt.find())
        {
            // Extract the URL itself.
            Matcher myurl = Pattern.compile("href=.*?>").matcher(mt.group());
            while (myurl.find())
            {
                String url = myurl.group().replaceAll("href=|>", "");
                // No synchronization is done here, so a few extra files end up being written.
                if (!urls.contains(url) && pageId < MAX_PAGENUM)
                {
                    urls.add(url);
                    // Start a new thread and repeat the steps above.
                    HTMLPage page = new HTMLPage(new URL(url));
                    page.start();
                }
            }
            System.out.println();
        }
    }

    public void run()
    {
        getPageFile();
        try
        {
            parse();
        } catch (MalformedURLException e)
        {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws MalformedURLException
    {
        HTMLPage page = new HTMLPage(new URL("http://www.baidu.com"));
        page.start();
    }
}
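The comment in parse() notes that the shared pageId counter and urls set are not synchronized, so slightly more than MAX_PAGENUM files can be written. A minimal sketch of one way to close that gap, keeping the thread-per-page design, is to make the counter atomic and wrap the set in a synchronized view. The class name CrawlState and the methods nextPageId()/markSeen() are illustrative, not part of the original program.

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

// Sketch of thread-safe replacements for the shared static state in HTMLPage.
public class CrawlState
{
    private static final int MAX_PAGENUM = 1000;
    // Atomic counter: incrementAndGet() hands out each file id exactly once,
    // so two threads can never claim the same pageId.
    private static final AtomicInteger pageId = new AtomicInteger(0);
    // Synchronized set: add() returns false for a URL that is already recorded,
    // which turns the "check then add" step into a single atomic operation.
    private static final Set<String> urls =
            Collections.synchronizedSet(new HashSet<String>());

    /** Returns the next file id, or -1 once the page limit has been reached. */
    public static int nextPageId()
    {
        int id = pageId.incrementAndGet();
        return id <= MAX_PAGENUM ? id : -1;
    }

    /** Records the URL; returns true only for URLs that have not been seen before. */
    public static boolean markSeen(String url)
    {
        return urls.add(url);
    }
}

With such helpers, getPageFile() would take its file id from CrawlState.nextPageId(), and in parse() the pair "!urls.contains(url)" followed by "urls.add(url)" would collapse into a single CrawlState.markSeen(url) call.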