java+Jsoup 正则过滤html网页标签

  1. //门票浏览 url参数 http://www.lvmama.com/dest/lantiancheng
  2. public static DataBean getWebData1(String url){
  3. DataBean data = null;
  4. try {
  5. Document docdata = Jsoup.connect(url).timeout(20000).get();
  6. String id = ".quick-menu .last a";
  7. String city = ".proDetail a";
  8. String title = ".proDetail h1";
  9. String content = "#Introduction";
  10. Elements ele = docdata.select(city);
  11. city = (ele.text());
  12. ele = docdata.select(title);
  13. title = (ele.text()).replace("<", "").replace(">", "").replace("\\/", "");
  14. ele = docdata.select(content);
  15. content = (ele.text());
  16. ele = docdata.select(id);
  17. String[] idary = ele.attr("href").split("\\/");
  18. id = idary[idary.length-1];
  19. String type = "景点门票";
  20. List<String> images = new ArrayList<String>();
  21. data = new DataBean(id,title,url,content,type,city,images);
  22. Dom4jUtil.createFile("data/["+type+"]"+title+".xml", Dom4jUtil.createDocument(data));
  23. type = "景点评论";
  24. content = ".userComments dl dd:eq(3)";
  25. ele = docdata.select(content);
  26. content = "";
  27. int i = 1;
  28. for (Element el : ele) {
  29. content += ("|第"+i+":"+el.text());
  30. i++;
  31. }
  32. data = new DataBean(id,title,url,content,type,city,images);
  33. //用xml存储数据
  34. Dom4jUtil.createFile("data/["+type+"]"+title+".xml", Dom4jUtil.createDocument(data));
  35. } catch (Exception e) {
  36. e.printStackTrace();
  37. }
  38. return data;
  39. }
//门票浏览  url参数 http://www.lvmama.com/dest/lantiancheng
	/**
	 * Scrapes a scenic-spot ticket page and stores the extracted data as two XML files.
	 *
	 * <p>The page is fetched with Jsoup (20 second timeout). From it the method reads the
	 * city, title, introduction text, an id taken from the last path segment of a link's
	 * {@code href}, and the user comments. A "景点门票" (ticket) bean is written first, then a
	 * "景点评论" (comments) bean whose content is every matched comment joined as
	 * {@code |第<n>:<text>}.
	 *
	 * <p>Failures are handled best-effort: any exception is printed and the method returns
	 * whatever bean was built last.
	 *
	 * @param url page address to scrape, e.g. {@code http://www.lvmama.com/dest/lantiancheng}
	 * @return the comments bean; the ticket bean if building the comments bean failed;
	 *         or {@code null} if the failure happened before any bean was built
	 */
	public static DataBean getWebData1(String url){
		DataBean data = null;
		try {
			Document docdata = Jsoup.connect(url).timeout(20000).get();

			String city = docdata.select(".proDetail a").text();

			// The title is spliced into a file path below, so besides the angle brackets
			// every path separator has to go. The previous replace("\\/", "") only removed
			// the literal two-character sequence "\/" and left plain slashes in place.
			String title = docdata.select(".proDetail h1").text()
					.replace("<", "").replace(">", "")
					.replace("\\", "").replace("/", "");

			String content = docdata.select("#Introduction").text();

			// The id is the last path segment of the link target. split() drops trailing
			// empty segments, so an href made only of slashes yields an empty array.
			String[] idary = docdata.select(".quick-menu .last a").attr("href").split("/");
			String id = idary.length > 0 ? idary[idary.length - 1] : "";

			List<String> images = new ArrayList<String>();

			String type = "景点门票";
			data = new DataBean(id,title,url,content,type,city,images);
			Dom4jUtil.createFile("data/["+type+"]"+title+".xml", Dom4jUtil.createDocument(data));

			type = "景点评论";
			StringBuilder comments = new StringBuilder();
			int i = 1;
			for (Element el : docdata.select(".userComments dl dd:eq(3)")) {
				comments.append("|第").append(i).append(":").append(el.text());
				i++;
			}
			data = new DataBean(id,title,url,comments.toString(),type,city,images);
			// Store the data as XML.
			Dom4jUtil.createFile("data/["+type+"]"+title+".xml", Dom4jUtil.createDocument(data));
		} catch (Exception e) {
			// Deliberately best-effort: report the problem and return what we have so far.
			e.printStackTrace();
		}
		return data;
	}

整个过程分为几步

1、 获取页面的节点对象

  1. Document docdata = Jsoup.connect(url).timeout(20000).get();
Document docdata = Jsoup.connect(url).timeout(20000).get();

2、创建选择器,选择页面节点对象的 text 或 html。选择方式跟 jQuery 非常类似。

例如:

String title = ".proDetail h1";

ele = docdata.select(title);

这个就是用来选择 class = proDetail 下 h1 标签的对象。通过 ele.text() 就得到了标签中的文本。

  1. Document docdata = Jsoup.connect(url).timeout(20000).get();
  2. String id = ".quick-menu .last a";
  3. String city = ".proDetail a";
  4. String title = ".proDetail h1";
  5. String content = "#Introduction";
  6. Elements ele = docdata.select(city);
  7. city = (ele.text());
  8. ele = docdata.select(title);
  9. title = (ele.text()).replace("<", "").replace(">", "").replace("\\/", "");
  10. ele = docdata.select(content);
  11. content = (ele.text());
  12. ele = docdata.select(id);
  13. String[] idary = ele.attr("href").split("\\/");
  14. id = idary[idary.length-1];
Document docdata = Jsoup.connect(url).timeout(20000).get();
			String id = ".quick-menu .last a";
			String city = ".proDetail a";
			String title = ".proDetail h1";
			String content = "#Introduction";
			Elements ele = docdata.select(city);
			city = (ele.text()); 
			ele = docdata.select(title);
			title = (ele.text()).replace("<", "").replace(">", "").replace("\\/", "");
			ele = docdata.select(content);
			content = (ele.text());
			ele = docdata.select(id);
			String[] idary = ele.attr("href").split("\\/");
			id = idary[idary.length-1];

3、通过获取的数据,构造java数据对象 。 然后存入xml或txt文件。 如果有需要也可以存入数据库啦。

  1. String type = "景点门票";
  2. List<String> images = new ArrayList<String>();
  3. data = new DataBean(id,title,url,content,type,city,images);
  4. Dom4jUtil.createFile("data/["+type+"]"+title+".xml", Dom4jUtil.createDocument(data));
String type = "景点门票";

			List<String> images = new ArrayList<String>();
			data = new DataBean(id,title,url,content,type,city,images);
			Dom4jUtil.createFile("data/["+type+"]"+title+".xml", Dom4jUtil.createDocument(data));

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值