利用htmlparser爬虫获取指定完整的完整区域信息

本文展示了一个使用Java实现的简单网页爬虫程序,该程序能够从指定网站抓取链接并解析页面内容,提取出地区编码等信息。通过递归访问网站上的链接,程序能够收集并整理相关信息。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

package com.hundsun.pc;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

public class Url {
	ArrayList<String> links= new ArrayList<String>();  
	final static String url="http://www.diqudaima.com";
	public static void main(String[] args) {
		String vistor = null ;
		try {
			LinkQueue queue = new LinkQueue();
			queue.addUnvisitedUrl("/zhejiang/hangzhoushi/");
			do{
				try {
					HttpClient http = new DefaultHttpClient();
					vistor = queue.unVisitedUrlDeQueue().toString();
					System.out.println("开始访问:" + url + vistor);
					HttpGet hg = new HttpGet(url + vistor);
					HttpResponse hr;
					hr = http.execute(hg);
					HttpEntity he = hr.getEntity();//
					Url Url = new Url();
					if (he != null) {
						String charset = EntityUtils.getContentCharSet(he);
						InputStream is = he.getContent();
						BufferedReader br = new BufferedReader(
								new InputStreamReader(is, "GBK"));
						String line = null;
						int i = 0;
						while ((line = br.readLine()) != null) {
							Url.geUrl(line, queue);
						}
					}
					queue.addVisitedUrl(vistor);
					http.getConnectionManager().shutdown();
					System.out.println("获取区域信息数:"+queue.getMap().size()+"已经访问链接数:"+queue.getVisitedUrl().size()+"异常链接数:"+queue.getErrorUrl().size());
				} catch (Exception e) {
					System.out.println("访问异常:"+url+vistor);
					queue.adderrorUrl(vistor);
					queue.addUnvisitedUrl(vistor);
				} 
			}while(!queue.isUnvisitedUrlsEmpty());
		} catch (Exception e) {
			e.printStackTrace();
		}finally{
			
		}
	}
	public  void geUrl(String html,LinkQueue queue){
		try{
			 NodeFilter filter = new TagNameFilter("li");
			 Parser p=new Parser();  
		     p.setInputHTML(html);  
		     NodeList list = p.extractAllNodesThatMatch(filter);  
		     for (int i = 0; i < list.size(); i++) {  
		    	 Node textnode = (Node) list.elementAt(i);
		    	 NodeList listChildren =textnode.getChildren();
		    	 String key="";
		    	 String value="";
		    	 for (int j = 0; j < listChildren.size(); j++) {  
		    		 Node textnodeChildren = (Node) listChildren.elementAt(j);
		    		 if(textnodeChildren.getClass()==LinkTag.class){
		    			 LinkTag nodeChildren = (LinkTag)textnodeChildren;
		    			 key=nodeChildren.getLinkText();
		    			 queue.addUnvisitedUrl(nodeChildren.getLink());
		    		 }else{
		    			 value=textnodeChildren.getText();
		    			 if(value.split("邮编").length>1)
		    				 value= value.substring(value.indexOf("地区编码:")+5, value.indexOf("邮编:"));
		    			 if(value.startsWith("[")){
		    				 value=value.replace("[", "");
			    			 value=value.replace("]", "");
		    			 }
		    			 if(key.equals("")){
		    				 String[] args =  value.split("\\[");
		    				 key = args[0];
		    				 value =  args[1].replace("]", "");
		    			 }
		    		 }
		    	 }
		    	 System.out.println("key="+key+"       value="+value);
		    	 if(!key.equals(""))
		    		 queue.getMap().put(key, value);
            }  
		}catch (Exception e) {
			e.printStackTrace();
		}
		 
	}

辅助类

package com.hundsun.pc;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
 * Created by amosli on 14-7-9.
 */
public class LinkQueue {
    //已经访问的队列
    private static Set visitedUrl = new HashSet();
    //已经访问的队列
    private static Set errorUrl = new HashSet();
    //未访问的队列
    private static Queue unVisitedUrl = new Queue();
    private Map map = new HashMap<String,String>();
    //获得URL队列
    public static Queue getUnVisitedUrl() {
        return unVisitedUrl;
    }
    public static Set getVisitedUrl() {
        return visitedUrl;
    }
    //添加到访问过的URL队列中
    public static void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }
    public static void adderrorUrl(String url) {
    	errorUrl.add(url);
    }

    //删除已经访问过的URL
    public static void removeVisitedUrl(String url){
        visitedUrl.remove(url);
    }
    //未访问的URL出队列
    public static Object unVisitedUrlDeQueue(){
        return unVisitedUrl.deQueue();
    }
    //保证每个URL只被访问一次,url不能为空,同时已经访问的URL队列中不能包含该url,而且因为已经出队列了所未访问的URL队列中也不能包含该url
    public static void addUnvisitedUrl(String url){
        if(url!=null&&!url.trim().equals("")&&!visitedUrl.contains(url)&&!unVisitedUrl.contains(url))
        unVisitedUrl.enQueue(url);
    }
    //获得已经访问过的URL的数量
    public static int getVisitedUrlNum(){
        return visitedUrl.size();
    }

    //判断未访问的URL队列中是否为空
    public static boolean isUnvisitedUrlsEmpty(){
        return unVisitedUrl.empty();
    }
	public Map getMap() {
		return map;
	}
	public void setMap(Map map) {
		this.map = map;
	}
	public static Set getErrorUrl() {
		return errorUrl;
	}
	public static void setErrorUrl(Set errorUrl) {
		LinkQueue.errorUrl = errorUrl;
	}
    
    
}


运行结果如下图



org.htmlparser.Tag org.htmlparser.Node org.htmlparser.Text org.htmlparser.Parser org.htmlparser.Remark org.htmlparser.tags.Div org.htmlparser.Attribute org.htmlparser.tags.Html org.htmlparser.tags.Span org.htmlparser.NodeFilter org.htmlparser.lexer.Page org.htmlparser.NodeFactory org.htmlparser.http.Cookie org.htmlparser.lexer.Lexer org.htmlparser.sax.Locator org.htmlparser.tags.Bullet org.htmlparser.tags.JspTag org.htmlparser.lexer.Cursor org.htmlparser.lexer.Source org.htmlparser.lexer.Stream org.htmlparser.sax.Feedback org.htmlparser.tags.BodyTag org.htmlparser.tags.FormTag org.htmlparser.tags.HeadTag org.htmlparser.tags.LinkTag org.htmlparser.tags.MetaTag org.htmlparser.nodes.TagNode org.htmlparser.sax.XMLReader org.htmlparser.tags.FrameTag org.htmlparser.tags.ImageTag org.htmlparser.tags.InputTag org.htmlparser.tags.LabelTag org.htmlparser.tags.StyleTag org.htmlparser.tags.TableRow org.htmlparser.tags.TableTag org.htmlparser.tags.TitleTag org.htmlparser.util.NodeList org.htmlparser.beans.LinkBean org.htmlparser.nodes.TextNode org.htmlparser.sax.Attributes org.htmlparser.tags.AppletTag org.htmlparser.tags.ObjectTag org.htmlparser.tags.OptionTag org.htmlparser.tags.ScriptTag org.htmlparser.tags.SelectTag org.htmlparser.util.Translate org.htmlparser.util.sort.Sort org.htmlparser.beans.BeanyBaby org.htmlparser.http.HttpHeader org.htmlparser.lexer.PageIndex org.htmlparser.tags.BulletList org.htmlparser.tags.DoctypeTag org.htmlparser.tags.HeadingTag org.htmlparser.util.NodeList$1 org.htmlparser.beans.FilterBean org.htmlparser.beans.StringBean org.htmlparser.filters.OrFilter org.htmlparser.nodes.RemarkNode org.htmlparser.scanners.Scanner org.htmlparser.tags.BaseHrefTag org.htmlparser.tags.FrameSetTag org.htmlparser.tags.TableColumn org.htmlparser.tags.TableHeader org.htmlparser.tags.TextareaTag org.htmlparser.util.ParserUtils org.htmlparser.beans.BeanyBaby$1 org.htmlparser.filters.AndFilter org.htmlparser.filters.NotFilter org.htmlparser.filters.XorFilter org.htmlparser.tags.CompositeTag org.htmlparser.tags.ParagraphTag org.htmlparser.util.IteratorImpl org.htmlparser.util.NodeIterator org.htmlparser.visitors.HtmlPage org.htmlparser.util.sort.Ordered org.htmlparser.beans.HTMLLinkBean org.htmlparser.beans.HTMLTextBean org.htmlparser.lexer.StringSource org.htmlparser.nodes.AbstractNode org.htmlparser.util.sort.Sortable org.htmlparser.filters.RegexFilter org.htmlparser.lexer.PageAttribute org.htmlparser.scanners.JspScanner org.htmlparser.scanners.TagScanner org.htmlparser.tags.DefinitionList org.htmlparser.util.NodeTreeWalker org.htmlparser.util.ParserFeedback org.htmlparser.filters.StringFilter org.htmlparser.util.FeedbackManager org.htmlparser.util.ParserException org.htmlparser.visitors.NodeVisitor org.htmlparser.filters.IsEqualFilter org.htmlparser.filters.TagNameFilter org.htmlparser.scanners.StyleScanner org.htmlparser.util.ChainedException org.htmlparser.filters.HasChildFilter org.htmlparser.http.ConnectionManager org.htmlparser.http.ConnectionMonitor org.htmlparser.scanners.ScriptDecoder org.htmlparser.scanners.ScriptScanner org.htmlparser.PrototypicalNodeFactory org.htmlparser.filters.HasParentFilter org.htmlparser.filters.LinkRegexFilter org.htmlparser.filters.NodeClassFilter org.htmlparser.lexer.InputStreamSource org.htmlparser.util.CharacterReference org.htmlparser.util.SimpleNodeIterator org.htmlparser.filters.HasSiblingFilter org.htmlparser.filters.LinkStringFilter org.htmlparser.tags.DefinitionListBullet org.htmlparser.util.CharacterReferenceEx org.htmlparser.filters.HasAttributeFilter org.htmlparser.util.DefaultParserFeedback org.htmlparser.visitors.TagFindingVisitor org.htmlparser.visitors.LinkFindingVisitor org.htmlparser.scanners.CompositeTagScanner org.htmlparser.util.EncodingChangeException org.htmlparser.visitors.UrlModifyingVisitor org.htmlparser.filters.CssSelectorNodeFilter org.htmlparser.tags.ProcessingInstructionTag org.htmlparser.visitors.ObjectFindingVisitor org.htmlparser.visitors.StringFindingVisitor org.htmlparser.visitors.TextExtractingVisitor org.htmlparser.filters.CssSelectorNodeFilter$1 org.htmlparser.parserapplications.SiteCapturer org.htmlparser.parserapplications.WikiCapturer org.htmlparser.parserapplications.LinkExtractor org.htmlparser.parserapplications.LinkExtractor$1 org.htmlparser.parserapplications.StringExtractor org.htmlparser.filters.CssSelectorNodeFilter$YesFilter org.htmlparser.parserapplications.filterbuilder.Filter org.htmlparser.filters.CssSelectorNodeFilter$AdjacentFilter org.htmlparser.parserapplications.SiteCapturer$LocalLinkTag org.htmlparser.parserapplications.SiteCapturer$LocalFrameTag org.htmlparser.parserapplications.SiteCapturer$LocalImageTag org.htmlparser.parserapplications.filterbuilder.FilterBuilder org.htmlparser.parserapplications.filterbuilder.HtmlTreeModel org.htmlparser.parserapplications.filterbuilder.SubFilterList org.htmlparser.filters.CssSelectorNodeFilter$AttribMatchFilter org.htmlparser.filters.CssSelectorNodeFilter$HasAncestorFilter org.htmlparser.parserapplications.SiteCapturer$LocalBaseHrefTag org.htmlparser.parserapplications.filterbuilder.HtmlTreeCellRenderer org.htmlparser.parserapplications.filterbuilder.wrappers.OrFilterWrapper org.htmlparser.parserapplications.filterbuilder.layouts.NullLayoutManager org.htmlparser.parserapplications.filterbuilder.wrappers.AndFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.NotFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.RegexFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.StringFilterWrapper org.htmlparser.parserapplications.filterbuilder.layouts.VerticalLayoutManager org.htmlparser.parserapplications.filterbuilder.wrappers.TagNameFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.HasChildFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.HasParentFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.NodeClassFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.HasSiblingFilterWrapper org.htmlparser.parserapplications.filterbuilder.wrappers.HasAttributeFilterWrapper
按DOM模型解析html文件的工具包 已下是源码列表: META-INF/MANIFEST.MF META-INF/maven/org.htmlparser/htmlparser/pom.properties META-INF/maven/org.htmlparser/htmlparser/pom.xml org.htmlparser.Parser.class org.htmlparser.PrototypicalNodeFactory.class org.htmlparser.beans.BeanyBaby.class org.htmlparser.beans.FilterBean.class org.htmlparser.beans.HTMLLinkBean.class org.htmlparser.beans.HTMLTextBean.class org.htmlparser.beans.LinkBean.class org.htmlparser.beans.StringBean.class org.htmlparser.filters.AndFilter.class org.htmlparser.filters.CssSelectorNodeFilter.class org.htmlparser.filters.HasAttributeFilter.class org.htmlparser.filters.HasChildFilter.class org.htmlparser.filters.HasParentFilter.class org.htmlparser.filters.HasSiblingFilter.class org.htmlparser.filters.IsEqualFilter.class org.htmlparser.filters.LinkRegexFilter.class org.htmlparser.filters.LinkStringFilter.class org.htmlparser.filters.NodeClassFilter.class org.htmlparser.filters.NotFilter.class org.htmlparser.filters.OrFilter.class org.htmlparser.filters.RegexFilter.class org.htmlparser.filters.StringFilter.class org.htmlparser.filters.TagNameFilter.class org.htmlparser.http.HttpHeader.class org.htmlparser.sax.Attributes.class org.htmlparser.sax.Feedback.class org.htmlparser.sax.Locator.class org.htmlparser.sax.XMLReader.class org.htmlparser.scanners.CompositeTagScanner.class org.htmlparser.scanners.JspScanner.class org.htmlparser.scanners.ScriptDecoder.class org.htmlparser.scanners.ScriptScanner.class org.htmlparser.scanners.StyleScanner.class org.htmlparser.tags.AppletTag.class org.htmlparser.tags.BaseHrefTag.class org.htmlparser.tags.BlockquoteTag.class org.htmlparser.tags.BodyTag.class org.htmlparser.tags.Bullet.class org.htmlparser.tags.BulletList.class org.htmlparser.tags.CompositeTag.class org.htmlparser.tags.DefinitionList.class org.htmlparser.tags.DefinitionListBullet.class org.htmlparser.tags.Div.class org.htmlparser.tags.DoctypeTag.class org.htmlparser.tags.FormTag.class org.htmlparser.tags.FrameSetTag.class org.htmlparser.tags.FrameTag.class org.htmlparser.tags.HeadTag.class org.htmlparser.tags.HeadingTag.class org.htmlparser.tags.Html.class org.htmlparser.tags.ImageTag.class org.htmlparser.tags.InputTag.class org.htmlparser.tags.JspTag.class org.htmlparser.tags.LabelTag.class org.htmlparser.tags.LinkTag.class org.htmlparser.tags.MetaTag.class org.htmlparser.tags.ObjectTag.class org.htmlparser.tags.OptionTag.class org.htmlparser.tags.ParagraphTag.class org.htmlparser.tags.ProcessingInstructionTag.class org.htmlparser.tags.ScriptTag.class org.htmlparser.tags.SelectTag.class org.htmlparser.tags.Span.class org.htmlparser.tags.StyleTag.class org.htmlparser.tags.TableColumn.class org.htmlparser.tags.TableHeader.class org.htmlparser.tags.TableRow.class org.htmlparser.tags.TableTag.class org.htmlparser.tags.TextareaTag.class org.htmlparser.tags.TitleTag.class org.htmlparser.util.CharacterReference.class org.htmlparser.util.CharacterReferenceEx.class org.htmlparser.util.DefaultParserFeedback.class org.htmlparser.util.FeedbackManager.class org.htmlparser.util.IteratorImpl.class org.htmlparser.util.NodeTreeWalker.class org.htmlparser.util.ParserFeedback.class org.htmlparser.util.ParserUtils.class org.htmlparser.util.Translate.class org.htmlparser.visitors.HtmlPage.class org.htmlparser.visitors.LinkFindingVisitor.class org.htmlparser.visitors.ObjectFindingVisitor.class org.htmlparser.visitors.StringFindingVisitor.class org.htmlparser.visitors.TagFindingVisitor.class org.htmlparser.visitors.TextExtractingVisitor.class org.htmlparser.visitors.UrlModifyingVisitor.class org/htmlparser/beans/images/Chain16.gif org/htmlparser/beans/images/Chain32.gif org/htmlparser/beans/images/Knot16.gif org/htmlparser/beans/images/Knot32.gif
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值