This program is a web crawler that uses multiple threads to fetch pages. It is written in a fairly professional way: the code is tidy and the flow is clear at a glance: take a starting URL —> fetch the content of that page —> extract the new URLs and add them to the URL collection —> start new threads to fetch the pages behind those new URLs, until the preset number of pages has been crawled. The drawback is that every batch of newly discovered pages spawns its own thread, which arguably wastes system resources. To be improved…
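As one possible direction for that improvement, below is a minimal sketch (not part of the original program) of how the same fetch/parse/enqueue flow could run on a fixed-size thread pool instead of one new thread per URL. The class name PooledCrawler, the pool size of 8, and the submit()/shutdown() methods are all illustrative assumptions; the download-and-parse step is left as a placeholder where the existing HTMLPage logic would go.

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;

// Hypothetical sketch: a fixed pool of worker threads is reused for every URL
// instead of starting a brand-new thread for each discovered page.
public class PooledCrawler
{
    private static final int MAX_PAGENUM = 1000;                           // same limit as HTMLPage
    private final ExecutorService pool = Executors.newFixedThreadPool(8);  // assumed pool size
    private final Set<String> seen =
            Collections.synchronizedSet(new HashSet<String>());            // thread-safe de-duplication
    private final AtomicInteger pageCount = new AtomicInteger();

    public void submit(final String url)
    {
        // add() on a synchronized set is atomic, so each URL is accepted only once;
        // stop submitting new work once the page limit has been reached.
        if (!seen.add(url) || pageCount.incrementAndGet() > MAX_PAGENUM)
        {
            return;
        }
        pool.execute(new Runnable()
        {
            public void run()
            {
                // Download the page and extract its links here (the same
                // fetch/parse steps that HTMLPage performs), then call
                // submit() for every newly found link so the pool crawls it.
                System.out.println("fetching " + url);
            }
        });
    }

    public void shutdown()
    {
        pool.shutdown();
    }
}

A caller would create one PooledCrawler, call submit() with the seed URL, and eventually call shutdown(); all crawling then shares the pooled threads rather than creating one thread per page.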
Interface: WebPage.java
import java.io.File;
import java.net.MalformedURLException;

/*
 * Defines the basic operations of a WebPage object.
 */
public interface WebPage
{
    /** Converts the page at the given web address into a local file. */
    public File getPageFile();

    /** Parses the contents of the page.
     * @throws MalformedURLException */
    public void parse() throws MalformedURLException;
}
Implementation and test class: HTMLPage.java
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A simple spider; it throws a lot of exceptions when run, which we ignore for now.
 * @author gbk
 *
 */
public class HTMLPage extends Thread implements WebPage
{
    private static int pageId = 0;
    private static int MAX_PAGENUM = 1000;
    // Stores URLs that have already been processed, to guarantee no duplicates.
    private static Set<String> urls = new HashSet<String>();
    private File localFile;
    private StringBuffer contents;
    private URL url;

    public HTMLPage(URL url)
    {
        this.url = url;
    }

    /**
     * Downloads the page to a local file for later analysis.
     */
    public File getPageFile()
    {
        int ch = 0;
        contents = new StringBuffer();
        pageId++;
        localFile = new File("d:/html/" + pageId + ".txt");
        try
        {
            InputStream inputStream = url.openStream();
            InputStreamReader inputStreamReader = new InputStreamReader(inputStream);
            FileOutputStream fileOutputStream = new FileOutputStream(localFile);
            OutputStreamWriter outputStreamWriter = new OutputStreamWriter(fileOutputStream);
            while ((ch = inputStreamReader.read()) != -1)
            {
                contents.append((char) ch);
                outputStreamWriter.write(ch);
            }
            outputStreamWriter.close();
            fileOutputStream.close();
            inputStreamReader.close();
            inputStream.close();
        } catch (FileNotFoundException e)
        {
            e.printStackTrace();
        } catch (IOException e)
        {
            e.printStackTrace();
        }
        return localFile;
    }

    /**
     * Parses the page and adds URLs that have not been seen before to the candidate set.
     */
    public void parse() throws MalformedURLException
    {
        // Cannot handle internal (relative) links, i.e. ones that do not start with http.
        String regex = "<a.*?href=http://.*?>.*?</a>";
        Pattern pt = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        Matcher mt = pt.matcher(contents);
        while (mt.find())
        {
            // Extract the URL itself.
            Matcher myurl = Pattern.compile("href=.*?>").matcher(mt.group());
            while (myurl.find())
            {
                String url = myurl.group().replaceAll("href=|>", "");
                // No synchronization is done here, so a few extra files end up being written.
                if (!urls.contains(url) && pageId < MAX_PAGENUM)
                {
                    urls.add(url);
                    // Start a new thread and repeat the steps above.
                    HTMLPage page = new HTMLPage(new URL(url));
                    page.start();
                }
            }
            System.out.println();
        }
    }

    public void run()
    {
        getPageFile();
        try
        {
            parse();
        } catch (MalformedURLException e)
        {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws MalformedURLException
    {
        HTMLPage page = new HTMLPage(new URL("http://www.baidu.com"));
        page.start();
    }
}
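The comment in parse() notes that the shared pageId counter and urls set are not synchronized, so slightly more than MAX_PAGENUM files can be written. A minimal sketch of one way to close that gap, keeping the thread-per-page design, is to make the counter atomic and wrap the set in a synchronized view. The class name CrawlState and the methods nextPageId()/markSeen() are illustrative, not part of the original program.

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

// Sketch of thread-safe replacements for the shared static state in HTMLPage.
public class CrawlState
{
    private static final int MAX_PAGENUM = 1000;
    // Atomic counter: incrementAndGet() hands out each file id exactly once,
    // so two threads can never claim the same pageId.
    private static final AtomicInteger pageId = new AtomicInteger(0);
    // Synchronized set: add() returns false for a URL that is already recorded,
    // which turns the "check then add" step into a single atomic operation.
    private static final Set<String> urls =
            Collections.synchronizedSet(new HashSet<String>());

    /** Returns the next file id, or -1 once the page limit has been reached. */
    public static int nextPageId()
    {
        int id = pageId.incrementAndGet();
        return id <= MAX_PAGENUM ? id : -1;
    }

    /** Records the URL; returns true only for URLs that have not been seen before. */
    public static boolean markSeen(String url)
    {
        return urls.add(url);
    }
}

With such helpers, getPageFile() would take its file id from CrawlState.nextPageId(), and in parse() the pair "!urls.contains(url)" followed by "urls.add(url)" would collapse into a single CrawlState.markSeen(url) call.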