jsoup爬网页生成excel

本文介绍了一种利用Jsoup库从特定网页抓取数据的方法,并将其转换为JSON格式以便进一步处理。通过示例代码展示了如何连接目标网址、解析HTML内容及获取所需元素。

一个成熟的项目,里面有很多表,已经在网页上实现了,现在要一起导出并压缩。表这么多,还不如用爬虫去爬,就可以少写很多代码了,于是……

	/**
	 * Smoke test for scraping detail pages with jsoup.
	 * <p>
	 * Reads {@code src/main/resources/jsoup_sheet.json}, takes the array stored under the
	 * {@code "水闸"} key and, for every entry, requests the page built from the entry's
	 * {@code action}. On each page the text of the elements with class {@code left} is
	 * printed next to the text of the element with class {@code right} at the same index,
	 * two pairs per output line. A failed page request is reported via its stack trace and
	 * the loop continues with the next entry.
	 *
	 * @throws IOException
	 *             if the configuration file cannot be read
	 */
	@Test
	public void test() throws IOException {
		Date date = new Date();
		StringBuilder sb = new StringBuilder();
		// try-with-resources: the reader (and the underlying file handle) is closed even
		// when readLine() throws; the original never closed it.
		try (BufferedReader br = new BufferedReader(new FileReader("src/main/resources/jsoup_sheet.json"))) {
			String s;
			while ((s = br.readLine()) != null) {
				sb.append(s);
			}
		}
		JSONObject obj = JSONObject.fromObject(sb.toString());
		JSONArray jsonArray = obj.getJSONArray("水闸");

		// Walk the sheets configured for this category and resolve each one's action
		for (Object object : jsonArray) {
			JSONObject json = JSONObject.fromObject(object);
			try {
				// The trailing "&_<millis>" is presumably a cache-busting parameter — TODO confirm
				String real = "http://localhost:8080/******/gcTg/" + json.get("action")
						+ "Detail.do?ennmcd=KHD00001082&_" + date.getTime();
				Document doc = Jsoup.connect(real).timeout(8000).get();
				System.out.println(doc);
				Elements leftClass = doc.getElementsByClass("left");
				Elements rightClass = doc.getElementsByClass("right");
				// Label and value counts may differ; only the common prefix can be paired
				int size = Math.min(leftClass.size(), rightClass.size());
				System.out.println("**********" + real + ":" + json.getString("name") + "************");
				for (int i = 0; i < size; i++) {
					System.out.print(leftClass.get(i).text() + rightClass.get(i).text());
					// Break the line after every second pair
					if ((i + 1) % 2 == 0) {
						System.out.println();
					}
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		// Elapsed time in whole seconds
		System.out.println("耗时" + (new Date().getTime() - date.getTime()) / 1000);
	}
/**
	 * Creates one sheet in the workbook and fills it with field/value pairs, two pairs
	 * per row (columns 0-1 hold the first pair, columns 2-3 the second).
	 * <p>
	 * Row 0 holds the sheet name in a cell merged across columns 0-3. Only as many pairs
	 * as both lists can supply are written. If that count is odd, the last row contains a
	 * single pair and columns 2-3 are left empty. Write failures are reported via their
	 * stack trace and are not propagated to the caller.
	 *
	 * @param fieldList
	 *            elements whose text is used as the field labels
	 * @param valueList
	 *            elements whose text is used as the values, matched to
	 *            {@code fieldList} by index
	 * @param name
	 *            sheet name, also written as the title in row 0
	 * @param sheetNum
	 *            index passed to {@code createSheet} for the new sheet
	 * @param book
	 *            workbook the sheet is added to
	 */
	private void setSheet4(List<Element> fieldList, List<Element> valueList, String name, int sheetNum,
			WritableWorkbook book) {
		try {
			WritableSheet sheet = book.createSheet(name, sheetNum);
			WritableCellFormat cf = new WritableCellFormat();
			// Title row
			sheet.mergeCells(0, 0, 3, 0);
			sheet.addCell(new Label(0, 0, name, cf));
			// Label and value counts may differ; only the common prefix can be paired
			int size = Math.min(fieldList.size(), valueList.size());
			// The row index is i + 1 with i advancing by 2, so data lands on rows 1, 3, 5, ...
			// (layout kept exactly as before).
			for (int i = 0; i < size; i += 2) {
				int row = i + 1;
				sheet.addCell(new Label(0, row, fieldList.get(i).text(), cf));
				sheet.addCell(new Label(1, row, valueList.get(i).text(), cf));
				// With an odd number of pairs the last row has no second pair; reading
				// index i + 1 unconditionally would throw IndexOutOfBoundsException.
				if (i + 1 < size) {
					sheet.addCell(new Label(2, row, fieldList.get(i + 1).text(), cf));
					sheet.addCell(new Label(3, row, valueList.get(i + 1).text(), cf));
				}
			}
			// sheetNum is a by-value int parameter, so incrementing it here would not be
			// visible to the caller; the caller has to advance its own sheet index.
		} catch (RowsExceededException e) {
			e.printStackTrace();
		} catch (WriteException e) {
			e.printStackTrace();
		}
	}

还有一个 json 文件,用来配置要去爬哪几个网页:

{
"暗窦":[{"name":"一般信息","action":"slcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"}],
"测站":[{"name":"一般信息","action":"slcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"}],
"城市防洪":[{"name":"一般信息","action":"cpfcmin"}],
"穿堤建筑物":[{"name":"一般信息","action":"pbccmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"},{"name":"","action":"rvcmin"}],
"船闸":[{"name":"一般信息","action":"czmin"},{"name":"船闸设计参数","action":"czmin/czsjcs"},{"name":"船闸闸门特征","action":"czmin/czzmtz"},{"name":"船闸工程信息","action":"czmin/czgctx"},{"name":"船闸历史运用记录","action":"czlsjl"}],
"堤段":[{"name":"一般信息","action":"ddinfo"},{"name":"堤段横断面特征值","action":"ddinfo/dktr"},{"name":"堤段水文特征","action":"ddinfo/bsfst"}]
}

转载于:https://my.oschina.net/xlpapapa/blog/1549919

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值