最近有一个项目需要从某高校教务处网站爬取各教学楼的教室安排数据,网址为:http://202.114.5.131/index.aspx 。
用firebug监视,点击页面上的“查询”按钮后,发现请求url是被加密过的,无法获取。后经人指点,可用Watij (Web Application Testing in Java) 来模拟网页控件操作,获取数据。
Watij支持模拟IE和Firefox,我用的是Firefox。使用前需要从官网 http://sourceforge.net/projects/watij/files/ 下载webspec_xxx.zip并解压,然后把java/dist/目录下的webspec.jar以及lib/目录下的全部jar包引入工程。
代码如下:
package org.itec.classroom;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import org.watij.webspec.dsl.Tag;
import org.watij.webspec.dsl.WebSpec;
/**
 * Drives a Firefox instance through Watij WebSpec to query the classroom
 * schedule page for each configured building and saves every result page
 * as an HTML file under {@code filePath/<building>/<page>.html}.
 */
public class WatijClassroom {
    private static final String homepage = "http://202.114.5.131/index.aspx";
    // Option values of the "Build" select for each teaching building:
    // West 12 -> 8, East 9 -> 7, West 5 -> 5, East 12 -> 1.
    // building[i] is the output folder name that pairs with buildingValue[i].
    private static final String[] building = {"W12", "E9", "W5", "E12"};
    private static final String[] buildingValue = {"8", "7", "5", "1"};
    private static final String filePath = "D:\\Classroom\\";

    /**
     * Opens the home page, then for every building selects it, selects all
     * classrooms, runs the query and downloads each result page.
     *
     * @param args unused
     * @throws Exception if the output directory cannot be created, a page
     *         cannot be written, or the browser automation fails
     */
    public static void main(String[] args) throws Exception {
        // Open the page in a Mozilla/Firefox browser.
        WebSpec spec = new WebSpec().mozilla();
        spec.open(homepage);

        // Iterate over the configured buildings instead of a hard-coded count.
        for (int i = 0; i < building.length; i++) {
            System.out.println("Fetching data of building " + building[i]);
            spec.find.select().with.id("Build").set("value", buildingValue[i]); // choose the building
            spec.find.input().with.name("btnRightall").click(); // select all classrooms
            spec.find.input().with.name("Button1").click(); // click the query button

            // FileWriter does not create missing parent directories, so make
            // sure the per-building folder exists before writing into it.
            File targetDir = new File(filePath, building[i]);
            if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
                throw new java.io.IOException("Cannot create directory " + targetDir);
            }

            // Save the current page first and only then look for the
            // "后页" (next page) link. Checking the link before saving would
            // skip the final page, which has no such link.
            int page = 0;
            boolean hasNextPage;
            do {
                page++;
                System.out.println("Downloading page " + page + " ...");
                // spec.source() returns the browser's current source as an HTML string.
                savePage(new File(targetDir, page + ".html"), spec.source());

                Tag next = spec.find.a().with.innerText("后页");
                hasNextPage = next.exists();
                if (hasNextPage) {
                    next.click();
                }
            } while (hasNextPage);
            System.out.println("All the pages have been downloaded.");
        }
    }

    /**
     * Writes {@code html} to {@code target}, closing the writer even when the
     * write fails.
     * NOTE(review): FileWriter encodes with the platform default charset;
     * confirm that this matches the encoding the saved pages declare.
     */
    private static void savePage(File target, String html) throws java.io.IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(target));
        try {
            writer.write(html);
        } finally {
            writer.close();
        }
    }
}
运行后如下图: