1.*采用Visitor方式访问Html2.3.try{4. Parser parser=newParser();5. parser.setURL(”http://www.google.com”);6. parser.setEncoding(parser.getEncoding());7. NodeVisitor visitor=newNodeVisitor() {8.publicvoidvisitTag(Tag tag) {9. logger.fatal(”testVisitorAll() Tag name is :”10.+tag.getTagName()+” \n Class is :”11.+tag.getClass());12. }13.14. };15.16. parser.visitAllNodesWith(visitor);17. }catch(ParserException e) {18. e.printStackTrace();19. }20.21.*采用Filter方式访问html22.23.try{24.25. NodeFilter filter=newNodeClassFilter(LinkTag.class);26. Parser parser=newParser();27. parser.setURL(”http://www.google.com”);28. parser.setEncoding(parser.getEncoding());29. NodeList list=parser.extractAllNodesThatMatch(filter);30.for(inti=0; i
149.*/150.publicvoidtestImageVisitor() {151.try{152. ImageTag imgLink;153. ObjectFindingVisitor visitor=newObjectFindingVisitor(154. ImageTag.class);155. Parser parser=newParser();156. parser.setURL(”http://www.google.com”);157. parser.setEncoding(parser.getEncoding());158. parser.visitAllNodesWith(visitor);159. Node[] nodes=visitor.getTags();160.for(inti=0; i
176.*/177.publicvoidtestNodeFilter() {178.try{179. NodeFilter filter=newTagNameFilter(”IMG”);180. Parser parser=newParser();181. parser.setURL(”http://www.google.com”);182. parser.setEncoding(parser.getEncoding());183. NodeList list=parser.extractAllNodesThatMatch(filter);184.for(inti=0; i
194.*/195.publicvoidtestLinkTag() {196.try{197.198. NodeFilter filter=newNodeClassFilter(LinkTag.class);199. Parser parser=newParser();200. parser.setURL(”http://www.google.com”);201. parser.setEncoding(parser.getEncoding());202. NodeList list=parser.extractAllNodesThatMatch(filter);203.for(inti=0; i用法
214.*/215.publicvoidtestLinkCSS() {216.try{217.218. Parser parser=newParser();219. parser220. .setInputHTML(”
Link Test”221.+“”222.+“”223.+“”+“”);224. parser.setEncoding(parser.getEncoding());225. NodeList nodeList=null;226.227.for(NodeIterator e=parser.elements(); e.hasMoreNodes();) {228. Node node=e.nextNode();229. logger230. .fatal(”testLinkCSS()”+node.getText()231.+node.getClass());232.233. }234. }catch(Exception e) {235. e.printStackTrace();236. }237. }238./*239. * 测试OrFilter的用法240.*/241.publicvoidtestOrFilter() {242. NodeFilter inputFilter=newNodeClassFilter(InputTag.class);243. NodeFilter selectFilter=newNodeClassFilter(SelectTag.class);244. Parser myParser;245. NodeList nodeList=null;246.247.try{248. Parser parser=newParser();249. parser250. .setInputHTML(”
OrFilter Test”251.+“”252.+“”253.+“”254.+“”255.+“”256.+“”257.+“12”258.+“ yeeach.com”259.+“”);260.261. parser.setEncoding(parser.getEncoding());262. OrFilter lastFilter=newOrFilter();263. lastFilter.setPredicates(newNodeFilter[] { selectFilter,264. inputFilter });265. nodeList=parser.parse(lastFilter);266.for(inti=0; i<=nodeList.size(); i++) {267.if(nodeList.elementAt(i)instanceofInputTag) {268. InputTag tag=(InputTag) nodeList.elementAt(i);269. logger.fatal(”OrFilter tag name is :”+tag.getTagName()270.+” ,tag value is:”+tag.getAttribute(”value”));271. }272.if(nodeList.elementAt(i)instanceofSelectTag) {273. SelectTag tag=(SelectTag) nodeList.elementAt(i);274. NodeList list=tag.getChildren();275.276.for(intj=0; j的解析292.*/293.publicvoidtestTable() {294. Parser myParser;295. NodeList nodeList=null;296. myParser=Parser.createParser(”
”+“1-11 | 1-12 | 1-13 |
1-21 | 1-22 | 1-23 |
1-31 | 1-32 | 1-33 |
2-11 | 2-12 | 2-13 |
2-21 | 2-22 | 2-23 |
2-31 | 2-32 | 2-33 |
333.*/334.publicvoidtestVisitorAll() {335.try{336. Parser parser=newParser();337. parser.setURL(”http://www.google.com”);338. parser.setEncoding(parser.getEncoding());339. NodeVisitor visitor=newNodeVisitor() {340.publicvoidvisitTag(Tag tag) {341. logger.fatal(”testVisitorAll() Tag name is :”342.+tag.getTagName()+” \n Class is :”343.+tag.getClass());344. }345.346. };347.348. parser.visitAllNodesWith(visitor);349. }catch(ParserException e) {350. e.printStackTrace();351. }352. }353./*354. * 测试对指定Tag的NodeVisitor的用法
355.*/356.publicvoidtestTagVisitor() {357.try{358.359. Parser parser=newParser(360. “
dddd”361.+“”362.+“”363.+“”+“”364.+“ yeeach.com”365.+“”);366. NodeVisitor visitor=newNodeVisitor() {367.publicvoidvisitTag(Tag tag) {368.if(taginstanceofHeadTag) {369. logger.fatal(”visitTag() HeadTag : Tag name is :”370.+tag.getTagName()+” \n Class is :”371.+tag.getClass()+“\n Text is :”372.+tag.getText());373. }elseif(taginstanceofTitleTag) {374. logger.fatal(”visitTag() TitleTag : Tag name is :”375.+tag.getTagName()+” \n Class is :”376.+tag.getClass()+“\n Text is :”377.+tag.getText());378.379.380. }elseif(taginstanceofLinkTag) {381. logger.fatal(”visitTag() LinkTag : Tag name is :”382.+tag.getTagName()+” \n Class is :”383.+tag.getClass()+“\n Text is :”384.+tag.getText()+” \n getAttribute is :”385.+tag.getAttribute(”href”));386. }else{387. logger.fatal(”visitTag() : Tag name is :”388.+tag.getTagName()+” \n Class is :”389.+tag.getClass()+“\n Text is :”390.+tag.getText());391. }392.393. }394.395. };396.397. parser.visitAllNodesWith(visitor);398. }catch(Exception e) {399. e.printStackTrace();400. }401. }402./*403. * 测试HtmlPage的用法404.*/405.publicvoidtestHtmlPage() {406. String inputHTML=“”+“
”407.+“Welcome to the HTMLParser website”408.+“”+“”+“Welcome to HTMLParser”409.+“1-11 | 1-12 | 1-13 |
1-21 | 1-22 | 1-23 |
1-31 | 1-32 | 1-33 |
2-11 | 2-12 | 2-13 |
2-21 | 2-22 | 2-23 |
2-31 | 2-32 | 2-33 |
440.*/441.publicvoidtestLinkBean() {442. Parser parser=newParser();443.444. LinkBean linkBean=newLinkBean();445. linkBean.setURL(”http://www.google.com”);446. URL[] urls=linkBean.getLinks();447.448.for(inti=0; i
5、相关的项目
nekohtml:评价比htmlparser好,可以把html正规化为标准的xml文档,再用xerces处理,但文档较少。
其他一些html parser可以参考相关的汇总文章:
6、参考文档
crawler,爬虫,htmlparser,nekohtml,scrape,scraping,spider